def runComputerPartition(self, config, environment,
                         process_group_pid_set=None,
                         stdout=None, stderr=None):
    """Request the test partition and run slapgrid once to instantiate it.

    Parameters:
      config -- mapping read for the keys 'instance_dict',
        'slapgrid_partition_binary' and 'slapos_config'.
      environment -- accepted for interface compatibility; not used here.
      process_group_pid_set -- optional set-like object. The slapgrid pid is
        added to it while the process runs and removed once it has finished.
      stdout, stderr -- optional seekable file objects handed to the child
        process. When given, they are rewound, read and closed here.

    Returns a dict with the keys 'status_code', 'command', 'stdout' and
    'stderr'. The output of a stream that was not provided is reported as
    an empty string, since nothing was captured for it.
    """
    self.log("SlapOSControler.runComputerPartition")
    slap = slapos.slap.slap()
    slap.registerOpenOrder().request(
        self.software_profile,
        partition_reference='testing partition',
        partition_parameter_kw=config['instance_dict'])
    command = [config['slapgrid_partition_binary'],
               config['slapos_config'], '-c', '-v']
    slapgrid = subprocess.Popen(command, stdout=stdout, stderr=stderr,
                                close_fds=True, preexec_fn=os.setsid)
    # The pid is only registered when the caller asked for tracking; the
    # previous code crashed on the default value (None).
    if process_group_pid_set is not None:
        process_group_pid_set.add(slapgrid.pid)
    slapgrid.wait()
    stream_list = (stdout, stderr)
    # Rewind everything first, then read, then close: the same file object
    # may have been passed for both streams.
    for stream in stream_list:
        if stream is not None:
            stream.seek(0)
    # The pid is deliberately left registered if wait() is interrupted, so
    # that the owner of the set can still find the running process.
    if process_group_pid_set is not None:
        process_group_pid_set.remove(slapgrid.pid)
    output_list = [stream.read() if stream is not None else ''
                   for stream in stream_list]
    status_dict = {'status_code': slapgrid.returncode,
                   'command': repr(command),
                   'stdout': output_list[0],
                   'stderr': output_list[1]}
    for stream in stream_list:
        if stream is not None:
            stream.close()
    return status_dict
def _initializeSlapOSConnection(self):
    """
    Open the connection to slapos and return (slap, supply, order).

    The connection is initialised repeatedly, up to 101 times, with a
    30 second pause after each failed attempt, until the slap object
    exposes a non-None ``_hateoas_navigator``. ValueError is raised when
    it is still missing afterwards.
    """
    slap = slapos.slap.slap()

    def navigatorMissing():
        return getattr(slap, '_hateoas_navigator', None) is None

    max_attempt_count = 101
    for _ in range(max_attempt_count):
        slap.initializeConnection(
            self.slapos_url,
            self.key_path,
            self.cert_path,
            timeout=120,
            slapgrid_rest_uri=self.slapos_api_rest_url)
        if not navigatorMissing():
            break
        # Not loaded yet: report it and wait before the next attempt.
        logger.info(
            "Fail to load _hateoas_navigator waiting a bit and retry.")
        time.sleep(30)

    if navigatorMissing():
        raise ValueError("Fail to load _hateoas_navigator")

    supply = slap.registerSupply()
    order = slap.registerOpenOrder()
    return slap, supply, order
def requestInstance(config, software_type=None):
    """
    Request the main instance of our environment.

    Parameters:
      config -- mapping read for the keys 'etc_dir' and 'master_url'; it is
        also passed to getCurrentSoftwareReleaseProfile and
        getSoftwareReleaseName.
      software_type -- software type to request. When given, it is stored in
        <etc_dir>/.software_type.xml for later calls. When omitted, the
        stored value is used if that file exists, otherwise 'default'.

    Returns whatever the open order's request() call returns.
    """
    software_type_path = os.path.join(config['etc_dir'], ".software_type.xml")
    if software_type:
        # Write it to conf file for later use
        with open(software_type_path, 'w') as software_type_file:
            software_type_file.write(software_type)
    elif os.path.exists(software_type_path):
        with open(software_type_path) as software_type_file:
            software_type = software_type_file.read().rstrip()
    else:
        software_type = 'default'

    slap = slapos.slap.slap()
    profile = getCurrentSoftwareReleaseProfile(config)
    slap.initializeConnection(config['master_url'])

    param_path = os.path.join(config['etc_dir'], ".parameter.xml")
    xml_result = readParameters(param_path)
    partition_parameter_kw = None
    # A plain string result is not treated as a parameter mapping
    # (presumably it is an error message -- verify against readParameters).
    if not isinstance(xml_result, str) and 'instance' in xml_result:
        partition_parameter_kw = xml_result['instance']

    return slap.registerOpenOrder().request(
        profile,
        partition_reference=getSoftwareReleaseName(config),
        partition_parameter_kw=partition_parameter_kw,
        software_type=software_type,
        filter_kw=None,
        state=None,
        shared=False)
def runComputerPartition(self, config, environment, stdout=None, stderr=None):
    """Request the test partition, then run slapgrid MAX_PARTIONS times.

    config['instance_dict'] is modified in place (a 'cloudooo-json' entry is
    forced to "{}") before being sent as the partition parameters. The
    status returned by the last slapgrid run is returned; environment,
    stdout and stderr are not used.
    """
    self.log("SlapOSControler.runComputerPartition")
    slap = slapos.slap.slap()
    # cloudooo-json is required but this is a hack which should be removed
    instance_parameter_dict = config['instance_dict']
    instance_parameter_dict['cloudooo-json'] = "{}"
    open_order = slap.registerOpenOrder()
    open_order.request(
        self.software_profile,
        partition_reference='testing partition',
        partition_parameter_kw=instance_parameter_dict)
    # Run once per possible partition, as one partition may in theory
    # request another one. This is not always required, but there is
    # currently no way to know how the "tree" of partitions may "expand".
    spawn_option_dict = {
        'raise_error_if_fail': False,
        'log_prefix': 'slapgrid_cp',
        'get_output': False,
    }
    for _ in range(MAX_PARTIONS):
        status_dict = self.spawn(
            config['slapgrid_partition_binary'],
            '-v',
            '-c',
            config['slapos_config'],
            **spawn_option_dict)
    return status_dict
def main():
    """
    Command-line entry point of the Slap Test Agent.

    Reads the configuration file given on the command line, creates one test
    result listing every non-'agent' section as a test, then starts a
    SoftwareReleaseTester for each test line handed out and polls the
    running testers until none is left.

    Note: This code does not test as much as it monitors.
    The goal is to regularly try to build & instantiate a software release
    on several machines, to monitor vifib stability and SR stability as time
    passes (and things once available online become unavailable).
    Part of this function could be reused to make an actual test bot, testing
    only when actual changes are committed to a software release, to look for
    regressions.

    Note: This code does not connect to any instantiated service, it relies on
    the presence of a promise section to make instantiation fail until promise
    is happy.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--pidfile', '-p', help='pidfile preventing parallel '
        'execution.')
    parser.add_argument('--log', '-l', help='Log file path.')
    parser.add_argument('--verbose', '-v', help='Be verbose.',
        action='store_true')
    parser.add_argument('configuration_file', type=argparse.FileType(),
        help='Slap Test Agent configuration file.')
    # Just to keep strong references to AutoSTemp instances
    key_file_dict = {}
    def asFilenamePair(key, cert):
        # Note: python's ssl support only supports fetching key & cert data
        # from on-disk files. This is why we need to "convert" direct data
        # into file paths, using temporary files.
        # Pairs are cached by (stripped) certificate content, so the same
        # certificate always maps to the same pair of temporary files.
        cert = cert.strip()
        try:
            temp_key, temp_cert = key_file_dict[cert]
        except KeyError:
            temp_key = AutoSTemp(key.strip())
            temp_cert = AutoSTemp(cert)
            key_file_dict[cert] = (temp_key, temp_cert)
        return temp_key.name, temp_cert.name
    args = parser.parse_args()

    # Logging always goes to stdout, and additionally to --log when given.
    log = args.log
    formatter = logging.Formatter('%(asctime)s %(message)s')
    logger = logging.getLogger()
    if args.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logger.setLevel(log_level)
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    if log:
        handler = logging.FileHandler(log)
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        # Second, read-only handle on the log file, positioned at its end;
        # it is handed to test_result.addWatch() below.
        # NOTE(review): this handle is never explicitly closed.
        log_file = open(log)
        log_file.seek(0, 2)
    pidfile = args.pidfile
    if pidfile:
        setRunning(pidfile)
    try:
        # Every section except 'agent' describes one test.
        section_dict = collections.OrderedDict()
        configuration = ConfigParser.SafeConfigParser()
        configuration.readfp(args.configuration_file)
        for section in configuration.sections():
            if section == 'agent':
                continue
            section_dict[section] = section_entry_dict = dict(
                configuration.items(section))
            # These options hold JSON-encoded values: decode them in place.
            for key in ('request_kw', 'max_install_duration',
                        'max_destroy_duration', 'max_request_duration',
                        'max_uninstall_duration', 'computer_list'
                    ):
                if key in section_entry_dict:
                    try:
                        if isinstance(section_entry_dict[key], str) or \
                                isinstance(section_entry_dict[key], unicode):
                            section_entry_dict[key] = json.loads(
                                section_entry_dict[key])
                    except Exception as exc:
                        logger.error("Fail to load %s on %s" % (key, section_entry_dict))
                        raise
            # Replace inline key/certificate data by temporary file paths.
            if 'key' in section_entry_dict:
                key_file, cert_file = asFilenamePair(section_entry_dict['key'],
                    section_entry_dict['cert'])
                section_entry_dict['key'] = key_file
                section_entry_dict['cert'] = cert_file
            # One computer of the list is picked at random for this run.
            if "computer_list" in section_entry_dict:
                section_entry_dict["target_computer"] = \
                    random.choice(section_entry_dict["computer_list"])
        agent_parameter_dict = dict(configuration.items('agent'))
        # XXX: should node title be auto-generated by installation recipe ?
        # For example, using computer guid.
        node_title = agent_parameter_dict['node_title']
        test_title = agent_parameter_dict['test_title']
        project_title = agent_parameter_dict['project_title']
        task_distribution_tool = TaskDistributionTool(agent_parameter_dict[
            'report_url'])
        # Cache of (supply, order, rpc) keyed by (master_url, key file path).
        master_slap_connection_dict = {}
        test_result = task_distribution_tool.createTestResult(
            revision='',
            test_name_list=section_dict.keys(),
            node_title=node_title,
            allow_restart=True,
            test_title=test_title,
            project_title=project_title,
        )
        test_result.watcher_period = 300
        if log:
            test_result.addWatch(log, log_file, max_history_bytes=10000)
        # NOTE(review): this assertion comes after test_result has already
        # been dereferenced above, so it can never be the first failure.
        assert test_result is not None
        test_mapping = TestMap(section_dict)
        logger.info("Running %s tests in parallel." % \
            len(test_mapping.getComputerList()))

        ran_test_set = set()
        # test name -> (test_line, tester, target_computer)
        running_test_dict = {}
        more_tests = True
        logger.info('Starting Test Agent run %s ' % node_title)
        while True:
            # Get up to parallel_task_count tasks to execute
            # (at most one running test per computer known to test_mapping).
            while len(running_test_dict) < len(test_mapping.getComputerList())\
                    and more_tests:
                test_mapping.cleanUp()
                target_computer = test_mapping.getNextComputer([computer \
                    for _, _, computer in running_test_dict.itervalues()])

                test_line = test_result.start(
                    exclude_list= list(ran_test_set) + \
                        list(test_mapping.getExcludeList(target_computer)))

                logger.info("Test Line: %s " % test_line)
                logger.info("Ran Test Set: %s " % ran_test_set)
                logger.info("Running test dict: %s " % running_test_dict)
                logger.info("Target Computer: %s " % target_computer)
                if test_line is None:
                    # Nothing left to start for this computer: drop it, and
                    # stop asking for tests once no computer remains.
                    test_mapping.dropComputer(target_computer)
                    if len(test_mapping.getComputerList()) == 0:
                        more_tests = False
                    continue
                test_name = test_line.name
                try:
                    section_entry_dict = section_dict[test_name]
                except KeyError:
                    # We don't know how to execute this test. Assume it doesn't
                    # exist anymore, and fail it in result.
                    test_line.stop(stderr='This test does not exist on test '
                        'node %s' % (node_title, ))
                    continue
                master_url = section_entry_dict['master_url']
                master_slap_connection_key = (master_url,
                    section_entry_dict.get('key'))
                try:
                    supply, order, rpc = master_slap_connection_dict[
                        master_slap_connection_key]
                except KeyError:
                    # First test using this master/key pair: connect and
                    # remember the connection objects for later tests.
                    key = section_entry_dict.get('key')
                    cert = section_entry_dict.get('cert')
                    slap = slapos.slap.slap()
                    slap.initializeConnection(master_url, key, cert)
                    supply = slap.registerSupply()
                    order = slap.registerOpenOrder()
                    assert master_url.startswith('https:')
                    rpc = xmlrpclib.ServerProxy(master_url, allow_none=True,
                        transport=x509Transport(
                            {'key_file': key, 'cert_file': cert}))
                    master_slap_connection_dict[
                        master_slap_connection_key] = (supply, order, rpc)
                tester = SoftwareReleaseTester(
                    test_name + '_' + node_title + time.strftime(
                        '_%Y/%m/%d_%H:%M:%S_+0000', time.gmtime()),
                    logger,
                    rpc,
                    supply,
                    order,
                    section_entry_dict['url'],
                    section_entry_dict['target_computer'],
                    section_entry_dict['max_install_duration'],
                    section_entry_dict['max_uninstall_duration'],
                    section_entry_dict.get('request_kw'),
                    section_entry_dict.get('max_request_duration'),
                    section_entry_dict.get('max_destroy_duration'),
                )
                ran_test_set.add(test_name)
                running_test_dict[test_name] = (test_line, tester, target_computer)
            if not running_test_dict:
                break

            now = time.time()
            # Synchronise refreshes on watcher period, so it doesn't report a
            # stalled test node where we are actually still sleeping.
            # Change test_result.watcher_period outside this loop if you wish
            # to change sleep duration.
            next_deadline = now + test_result.watcher_period
            # Entries are deleted from running_test_dict inside this loop;
            # this relies on items() returning a list copy (Python 2).
            for section, (test_line, tester, target_computer) in running_test_dict.items():
                logger.info('Checking %s: %r...', section, tester)
                try:
                    deadline = tester.tic(now)
                except Exception:
                    # The test failed: report it as an error, then clean up.
                    logger.exception('Test execution fail for %s' % (section))
                    test_line.stop(
                        test_count=1,
                        error_count=1,
                        failure_count=0,
                        skip_count=0,
                        stderr=traceback.format_exc(),
                    )
                    del running_test_dict[section]
                    try:
                        tester.teardown()
                    except slapos.slap.NotFoundError:
                        # This exception is ignored because we cannot
                        # teardown if the SR URL does not exist.
                        logger.exception('Fail and not found')
                        pass
                    except Exception:
                        logger.exception('teardown failed, human '
                            'assistance needed for cleanup')
                        raise
                else:
                    logger.info('%r', tester)
                    if deadline is None:
                        # A None deadline means the tester has finished.
                        # TODO: report how long each step took.
                        logger.info('Test execution finished for %s' % (section))
                        test_line.stop(
                            test_count=1,
                            error_count=0,
                            failure_count=0,
                            skip_count=0,
                        )
                        del running_test_dict[section]
                        try:
                            tester.teardown()
                        except slapos.slap.NotFoundError:
                            # This exception is ignored because we cannot
                            # teardown if the SR URL does not exist.
                            logger.exception('Fail and not found')
                            pass
                        except Exception:
                            logger.exception('teardown failed, human '
                                'assistance needed for cleanup')
                            raise
                    else:
                        # Wake up for whichever tester needs attention first.
                        next_deadline = min(deadline, next_deadline)
            if running_test_dict:
                to_sleep = next_deadline - time.time()
                if to_sleep > 0:
                    logger.info('Sleeping %is...', to_sleep)
                    time.sleep(to_sleep)
                # NOTE(review): the testers torn down here stay in
                # running_test_dict and the loop keeps polling them --
                # confirm whether leaving the loop is intended at this point.
                if not test_result.isAlive():
                    for _, tester, computer_id in running_test_dict.itervalues():
                        tester.teardown()
    finally:
        if pidfile:
            setFinished(pidfile)
        # Help interpreter get rid of AutoSTemp instances.
        key_file_dict.clear()
def main():
    """
    Command-line entry point of the Slap Test Agent (task-distributor driven).

    Runs forever: on every pass it asks the task distributor for the test
    suites assigned to this node, creates a test result for each suite,
    starts a SoftwareReleaseTester for every test line handed out and polls
    the running testers until none is left. It then sleeps 300 seconds
    before the next pass.

    Note: This code does not test as much as it monitors.
    The goal is to regularly try to build & instantiate a software release
    on several machines, to monitor vifib stability and SR stability as time
    passes (and things once available online become unavailable).
    Part of this function could be reused to make an actual test bot, testing
    only when actual changes are committed to a software release, to look for
    regressions.

    Note: This code does not connect to any instantiated service, it relies on
    the presence of a promise section to make instantiation fail until promise
    is happy.

    Raises ValueError when the slap connection still has no
    _hateoas_navigator after all connection attempts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--pidfile', '-p', help='pidfile preventing parallel '
        'execution.')
    parser.add_argument('--log', '-l', help='Log file path.')
    parser.add_argument('--verbose', '-v', help='Be verbose.',
        action='store_true')
    parser.add_argument('configuration_file', type=argparse.FileType(),
        help='Slap Test Agent configuration file.')
    # Bound here so that the final cleanup works even when an error happens
    # before the first pass rebinds it.
    key_file_dict = {}
    args = parser.parse_args()

    log = args.log

    logger, log_file = getLogger(log, args.verbose)

    configuration = ConfigParser.SafeConfigParser()
    configuration.readfp(args.configuration_file)

    pidfile = args.pidfile
    if pidfile:
        setRunning(logger=logger, pidfile=pidfile)
    try:
        while True:
            section_dict = loadConfiguration(configuration, logger)
            agent_parameter_dict = dict(configuration.items('agent'))

            task_distributor = TaskDistributor(
                agent_parameter_dict['report_url'], logger=logger)

            task_distributor.subscribeNode(
                node_title=agent_parameter_dict['node_title'],
                computer_guid="None")

            test_suite_data = task_distributor.startTestSuite(
                node_title=agent_parameter_dict['node_title'],
                computer_guid="None")

            if type(test_suite_data) == str:
                # Backward compatibility
                test_suite_data = json.loads(test_suite_data)

            slap_account_key = task_distributor.getSlaposAccountKey()
            slap_certificate = task_distributor.getSlaposAccountCertificate()
            master_url = task_distributor.getSlaposUrl()

            # Keeps strong references to the AutoSTemp instances of this pass.
            key_file_dict = {}
            def asFilenamePair(key, cert):
                # Note: python's ssl support only supports fetching key &
                # cert data from on-disk files. This is why we need to
                # "convert" direct data into file paths, using temporary
                # files.
                cert = cert.strip()
                try:
                    temp_key, temp_cert = key_file_dict[cert]
                except KeyError:
                    temp_key = AutoSTemp(key.strip())
                    temp_cert = AutoSTemp(cert)
                    key_file_dict[cert] = (temp_key, temp_cert)
                return temp_key.name, temp_cert.name

            key_file, cert_file = asFilenamePair(slap_account_key,
                                                 slap_certificate)

            process_manager = ProcessManager(logger.info)

            for test_suite in test_suite_data:
                full_revision_list = getAndUpdateFullRevisionList(
                    test_suite,
                    agent_parameter_dict["working_directory"],
                    logger,
                    process_manager)

                unit_test_dict = task_distributor.generateConfiguration(
                    test_suite['test_suite_title'])

                if not len(full_revision_list):
                    # We don't watch git revision but we periodically
                    # run the test, once a day.
                    full_revision_list = ["day=%s" % time.strftime(
                        '%Y/%m/%d', time.gmtime())]

                if type(unit_test_dict) == str:
                    # Backward compatibility
                    unit_test_dict = json.loads(unit_test_dict)

                test_result = task_distributor.createTestResult(
                    revision=','.join(full_revision_list),
                    test_name_list=unit_test_dict.keys(),
                    node_title=agent_parameter_dict['node_title'],
                    allow_restart=False,
                    test_title=test_suite['test_suite_title'],
                    project_title=agent_parameter_dict['project_title'],
                )
                if test_result is None:
                    # We already have a test result
                    logger.info('Skiping test for %s, result already available (%s)' %
                        (test_suite['test_suite_title'],
                         ','.join(full_revision_list)))
                    continue

                test_result.watcher_period = 120

                if log_file is not None:
                    test_result.addWatch(log, log_file,
                                         max_history_bytes=10000)

                logger.info("Starting to run for %s" % test_result )

                test_mapping = TestMap(unit_test_dict)
                logger.info("Running %s tests in parallel." % \
                    len(test_mapping.getGroupList()))

                assert master_url.startswith('https:')
                slap = slapos.slap.slap()
                retry = 0
                while True:
                    # wait until _hateoas_navigator is loaded.
                    if retry > 100:
                        break
                    slap.initializeConnection(
                        master_url, key_file, cert_file, timeout=120)

                    if getattr(slap, '_hateoas_navigator', None) is None:
                        # Count the failed attempt; without this the limit
                        # above is never reached and the wait never ends.
                        retry += 1
                        logger.info("Fail to load _hateoas_navigator waiting a bit and retry.")
                        time.sleep(30)
                    else:
                        break

                if getattr(slap, '_hateoas_navigator', None) is None:
                    raise ValueError("Fail to load _hateoas_navigator")

                supply = slap.registerSupply()
                order = slap.registerOpenOrder()

                # test name -> (test_line, tester, group)
                running_test_dict = {}

                logger.info('Starting Test Agent run %s ' %
                            agent_parameter_dict['node_title'])
                while True:
                    # Get up to parallel_task_count tasks to execute
                    while len(running_test_dict) < len(test_mapping.getGroupList())\
                            and (len(test_mapping.getGroupList()) > 0):
                        test_mapping.cleanEmptyGroup()

                        # Select an unused computer to run the test.
                        group = test_mapping.getNextGroup(
                            ignore_list = [group for _, _, group in \
                                running_test_dict.itervalues()])

                        # Select a test
                        test_line = test_result.start(
                            exclude_list=list(
                                test_mapping.getExcludeList(group)))

                        logger.info("Test Line: %s " % test_line)
                        logger.info("Ran Test Set: %s " % test_mapping.ran_test_set)
                        logger.info("Running test dict: %s " % running_test_dict)
                        logger.info("Group: %s " % group)

                        if test_line is None:
                            logger.info("Removing Group (empty test line): %s " % group)
                            test_mapping.dropGroup(group)
                            continue
                        test_name = test_line.name
                        try:
                            section_entry_dict = unit_test_dict[test_name]
                        except KeyError:
                            # We don't know how to execute this test. Assume
                            # it doesn't exist anymore, and fail it in result.
                            test_line.stop(stderr='This test does not exist on test '
                                'node %s' % (agent_parameter_dict['node_title'], ))
                            continue

                        general_timeout = agent_parameter_dict.get(
                            'timeout', 3600)
                        tester = SoftwareReleaseTester(
                            test_name + time.strftime(
                                '_%Y/%m/%d_%H:%M:%S_+0000', time.gmtime()),
                            logger,
                            slap,
                            order,
                            supply,
                            section_entry_dict['url'],
                            section_entry_dict.get('supply_computer'),
                            section_entry_dict.get('request_kw'),
                            agent_parameter_dict.get('software_timeout',
                                                     general_timeout),
                            agent_parameter_dict.get('instance_timeout',
                                                     general_timeout)
                        )
                        test_mapping.addRanTest(test_name)
                        running_test_dict[test_name] = (test_line, tester, group)

                    if not running_test_dict:
                        logger.info('No more tests to run...')
                        break

                    now = time.time()
                    # Synchronise refreshes on watcher period, so it doesn't
                    # report a stalled test node where we are actually still
                    # sleeping.
                    # Change test_result.watcher_period outside this loop if
                    # you wish to change sleep duration.
                    next_deadline = now + test_result.watcher_period

                    # Entries are deleted inside this loop; this relies on
                    # items() returning a list copy (Python 2).
                    for section, (test_line, tester, group) in running_test_dict.items():
                        logger.info('Checking %s: %r...', section, tester)
                        try:
                            deadline = tester.tic(now)
                        except ConnectionError:
                            # Connection trouble is not a test failure: keep
                            # the test running and check again later.
                            logger.exception('Test execution ConnectionError for %s' % (section))
                            deadline = next_deadline
                        except Exception:
                            logger.exception('Test execution fail for %s' % (section))
                            test_line.stop(
                                test_count=1,
                                error_count=1,
                                failure_count=0,
                                skip_count=0,
                                command=tester.getInfo(),
                                stdout=tester.getFormatedLastMessage(),
                                stderr=traceback.format_exc())

                            del running_test_dict[section]
                            try:
                                tester.teardown()
                            except slapos.slap.NotFoundError:
                                # This exception is ignored because we cannot
                                # teardown if the SR URL does not exist.
                                logger.exception('Fail and not found')
                            except Exception:
                                logger.exception('teardown failed, human assistance needed for cleanup')
                                raise
                        else:
                            logger.info('%r' % tester)
                            if deadline is None:
                                # TODO: report how long each step took.
                                logger.info('Test execution finished for %s' % (section))
                                test_line.stop(
                                    test_count=1,
                                    error_count=0,
                                    failure_count=0,
                                    skip_count=0,
                                    command=tester.getInfo(),
                                    stdout=tester.getFormatedLastMessage())

                                # Teardown is deliberately not run after a
                                # successful test (the call was already
                                # disabled in the previous code).
                                del running_test_dict[section]
                            else:
                                next_deadline = min(deadline, next_deadline)

                    if running_test_dict:
                        to_sleep = next_deadline - time.time()
                        if to_sleep > 0:
                            logger.info('Sleeping %is...', to_sleep)
                            time.sleep(to_sleep)
                        # NOTE(review): the testers torn down here stay in
                        # running_test_dict and keep being polled -- confirm
                        # whether leaving the loop is intended.
                        if not test_result.isAlive():
                            for _, tester, computer_id in running_test_dict.itervalues():
                                tester.teardown()

            # Pause between two passes over the assigned test suites.
            time.sleep(300)

    finally:
        if pidfile:
            setFinished(pidfile)
        # Help interpreter get rid of AutoSTemp instances.
        key_file_dict.clear()
def _terminateSupervisord(supervisord_pid_file):
    """Best-effort SIGTERM to the process whose pid is stored in the file.

    Nothing happens when the file does not exist; errors are ignored on
    purpose because this is only used during cleanup.
    """
    try:
        if os.path.exists(supervisord_pid_file):
            with open(supervisord_pid_file) as pid_file:
                supervisord_pid = int(pid_file.read().strip())
            os.kill(supervisord_pid, signal.SIGTERM)
    except Exception:
        pass


def run(args):
    """Build the software, instantiate one partition, then run a buildbot slave.

    args[0] is the configuration mapping. Its 'environment' entries are
    exported to os.environ first. Then, in order:

    1. A slapproxy is started, the software release is supplied and slapgrid
       is re-run every 600 seconds until the software installs.
    2. One partition is declared and requested, slapgrid is run once for it,
       and the partition must then provide bin/runUnitTest.
    3. The proxy (and any slapgrid still running) is terminated.
    4. A buildbot slave is created and started, and this function sleeps
       forever. On the way out, buildbot and supervisord are stopped.

    Raises ValueError when the instance cannot be built, when runUnitTest is
    missing, or when a buildbot command fails.
    """
    config = args[0]
    for k, v in config['environment'].items():
        os.environ[k] = v
    proxy = None
    slapgrid = None
    supervisord_pid_file = os.path.join(config['instance_root'], 'var', 'run',
                                        'supervisord.pid')
    if os.path.exists(config['proxy_database']):
        os.unlink(config['proxy_database'])
    try:
        proxy = subprocess.Popen([config['slapproxy_binary'],
                                  config['slapos_config']],
                                 close_fds=True, preexec_fn=os.setsid)
        process_group_pid_list.append(proxy.pid)
        slap = slapos.slap.slap()
        slap.initializeConnection(config['master_url'])
        # The proxy may not accept connections yet: retry until it does.
        while True:
            try:
                slap.registerSupply().supply(
                    config['profile_url'],
                    computer_guid=config['computer_id'])
            except socket.error:
                time.sleep(1)
            else:
                break
        # Re-run the software installation until it succeeds.
        while True:
            slapgrid = subprocess.Popen([config['slapgrid_software_binary'],
                                         '-vc', config['slapos_config']],
                                        close_fds=True, preexec_fn=os.setsid)
            process_group_pid_list.append(slapgrid.pid)
            slapgrid.wait()
            if slapgrid.returncode == 0:
                print('Software installed properly')
                break
            print('Problem with software installation, trying again')
            time.sleep(600)
        computer = slap.registerComputer(config['computer_id'])
        partition_reference = config['partition_reference']
        partition_path = os.path.join(config['instance_root'],
                                      partition_reference)
        if not os.path.exists(partition_path):
            os.mkdir(partition_path)
        os.chmod(partition_path, 0o750)
        computer.updateConfiguration(xml_marshaller.dumps({
            'address': config['ipv4_address'],
            'instance_root': config['instance_root'],
            'netmask': '255.255.255.255',
            'partition_list': [
                {'address_list': [{'addr': config['ipv4_address'],
                                   'netmask': '255.255.255.255'},
                                  {'addr': config['ipv6_address'],
                                   'netmask': 'ffff:ffff:ffff::'},
                                  ],
                 'path': partition_path,
                 'reference': partition_reference,
                 'tap': {'name': partition_reference},
                 }
            ],
            'reference': config['computer_id'],
            'software_root': config['software_root']}))
        slap.registerOpenOrder().request(
            config['profile_url'],
            partition_reference='testing partition',
            partition_parameter_kw=config['instance_dict'])
        slapgrid = subprocess.Popen([config['slapgrid_partition_binary'],
                                     '-vc', config['slapos_config']],
                                    close_fds=True, preexec_fn=os.setsid)
        slapgrid.wait()
        if slapgrid.returncode != 0:
            raise ValueError('Slapgrid instance failed')
        runUnitTest = os.path.join(partition_path, 'bin', 'runUnitTest')
        if not os.path.exists(runUnitTest):
            raise ValueError('No %r provided' % runUnitTest)
    except:
        # Whatever went wrong (interruption included), stop supervisord
        # before propagating. The cleanup runs in its own function so that
        # an error it swallows cannot replace the one re-raised here.
        _terminateSupervisord(supervisord_pid_file)
        raise
    finally:
        # Nice way to kill *everything* generated by run process -- process
        # groups working only in POSIX compliant systems.
        # NOTE(review): errors raised by os.killpg below are NOT swallowed,
        # although an earlier comment claimed so -- confirm the intent.
        if proxy is not None:
            os.killpg(proxy.pid, signal.SIGTERM)
            if os.path.exists(config['proxy_database']):
                os.unlink(config['proxy_database'])
        if slapgrid is not None and slapgrid.returncode is None:
            os.killpg(slapgrid.pid, signal.SIGTERM)
    try:
        bot_env = os.environ.copy()
        bot_env['PATH'] = ':'.join([config['bin_directory']] +
                                   bot_env['PATH'].split(':'))
        for l in config['bot_environment'].split():
            # Split on the first '=' only: the value itself may contain '='.
            k, v = l.split('=', 1)
            bot_env[k] = v
        if subprocess.call([config['buildbot_binary'], 'create-slave', '-f',
                            config['working_directory'],
                            config['buildbot_host'],
                            config['slave_name'],
                            config['slave_password']]) != 0:
            raise ValueError('Buildbot call failed')
        process_command_list.append([config['buildbot_binary'], 'stop',
                                     config['working_directory']])
        # create-slave may leave an updated buildbot.tac.new: install it in
        # place of buildbot.tac when present.
        new_tac = os.path.join(config['working_directory'],
                               'buildbot.tac.new')
        if os.path.exists(new_tac):
            tac = os.path.join(config['working_directory'], 'buildbot.tac')
            if os.path.exists(tac):
                os.unlink(tac)
            os.rename(new_tac, tac)
        if subprocess.call([config['buildbot_binary'], 'start',
                            config['working_directory']], env=bot_env) != 0:
            raise ValueError('Issue during starting buildbot')
        # Stay alive until interrupted.
        while True:
            time.sleep(3600)
    finally:
        # Best-effort shutdown: stop buildbot, then supervisord.
        try:
            subprocess.call([config['buildbot_binary'], 'stop',
                             config['working_directory']])
        except Exception:
            pass
        _terminateSupervisord(supervisord_pid_file)