def terminal(self, retry_count=1):
    """Open an SSH session to this host and return it once logged in.

    The key path (``ssh_cert``, default ``id_rsa``) and login name
    (``username``, default ``root``) are read from ``self._config``; the
    target is ``self._hostname``. Host-key checking is disabled through the
    ssh options handed to ``pxssh``.

    Args:
        retry_count: Attempt budget handed to ``retry``.

    Returns:
        The logged-in ``pxssh.pxssh`` session.

    Raises:
        Whatever escapes the retry loop; the session is closed first.
    """
    key_file = self._config.get("ssh_cert", "id_rsa")
    user = self._config.get("username", "root")
    host = self._hostname
    ssh_options = {
        "StrictHostKeyChecking": "no",
        "UserKnownHostsFile": "/dev/null",
    }
    session = pxssh.pxssh(options=ssh_options)

    self.debug(
        "SSHing into %s@%s with options %s; %s",
        user,
        host,
        json.dumps(ssh_options),
        f"-i {key_file}; retry_count={retry_count}",
    )

    # Anything that escapes the retry loop must not leave the session open.
    try:
        attempts = retry(  # pylint: disable=unexpected-keyword-arg
            retry_count,
            task=(f"attach ssh terminal to host {self._hostname}",
                  "attach ssh terminal to remote"),
            logger=self.logger)
        for attempt in attempts:
            # NOTE(review): the same pxssh object is reused for every login
            # attempt -- confirm pxssh supports re-login after a failure.
            try:
                session.login(host, username=user, ssh_key=key_file)
            except pxssh.ExceptionPxssh as login_error:
                attempt.failed(login_error)
                continue
            return session
    except BaseException:
        # Release the connection, then let the original error propagate.
        session.close()
        raise
def run_sequence(self, ssh, sequence, retry_count=1, timeout=30):
    """Run shell commands in order over an open SSH session, with retries.

    Each command is sent to ``ssh``; if the prompt does not come back within
    ``timeout`` seconds a Ctrl-C is sent and the wait repeats. The exit
    status is then read with ``echo $?``. A non-zero (or undeterminable)
    status fails the current attempt and the whole sequence is retried from
    its first command.

    Args:
        ssh: Logged-in session exposing ``sendline``, ``prompt``,
            ``sendcontrol``, ``before`` and ``close`` (pxssh-style).
        sequence: Iterable of command strings.
        retry_count: Attempt budget handed to ``retry``.
        timeout: Seconds to wait for the prompt after each command.

    Returns:
        None once every command of an attempt exited with status 0.

    Raises:
        Whatever escapes the retry loop; ``ssh`` is closed first.
    """
    self.debug("Executing sequence %s with options retry_count=%d",
               sequence, retry_count)

    # Capture failed exceptions to close resources
    try:
        task_messages = (
            f"execute command sequence to host {self._hostname}",
            "execute command sequence to remote")
        for current in retry(retry_count, task=task_messages, logger=self.logger):  # pylint: disable=unexpected-keyword-arg
            failed = False
            for command in sequence:
                ssh.sendline(command)

                # Collect output; interrupt the command whenever the prompt
                # fails to come back within the timeout.
                chunks = []
                while not ssh.prompt(timeout=timeout):
                    chunks.append(ssh.before.decode())
                    ssh.sendcontrol('c')
                chunks.append(ssh.before.decode())
                output = "".join(chunks)

                ssh.sendline("echo $?")
                ssh.prompt(timeout=10)
                result = ssh.before.decode().strip().splitlines()

                # Default to failure so an unreadable status is never
                # mistaken for success (and the name is always bound).
                exitcode = 1
                if result:
                    try:
                        exitcode = int(result[-1])
                    except (TypeError, ValueError):
                        # int() raises ValueError for non-numeric text.
                        self.warning(
                            "Couldn't decode exit code from command %s: %s",
                            command, result)
                else:
                    self.warning(
                        "Couldn't decode exit code from command %s: %s",
                        command, result)

                self.debug("[%s] %s", str(exitcode), output)

                # If exit code is non-zero, assume failed
                if exitcode != 0:
                    current.failed(f"command: {command}")
                    failed = True
                    break

            if not failed:
                return
    # Close ssh connection before re-throwing exception
    except BaseException:
        ssh.close()
        raise
def transfer(self, local_src=None, local_dest=None, remote_path=None, retry_count=1):
    """Copy a file to or from this host with ``scp``, with retries.

    When ``local_src`` is given the file is uploaded to ``remote_path``;
    otherwise ``remote_path`` is downloaded to ``local_dest``. The key path
    (``ssh_cert``) and login name (``username``) come from ``self._config``.

    Args:
        local_src: Local file to upload, or None to download instead.
        local_dest: Local destination used when downloading.
        remote_path: Path on the remote host.
        retry_count: Attempt budget handed to ``retry``.

    Returns:
        True when ``scp`` exits with status 0. Falls through (returning
        None) if the retry iterator is exhausted without success.

    Raises:
        Whatever ``pexpect`` raises while waiting for ``scp`` to finish;
        the child process is closed before the error propagates.
    """
    cert_path = self._config.get("ssh_cert", "id_rsa")
    username = self._config.get("username", "root")
    # NOTE(review): retry_delay is only logged here, never passed to retry().
    retry_delay = self._config.get("retry_delay", 120)
    prelude = [
        '-o', 'UserKnownHostsFile=/dev/null',
        '-o', 'StrictHostKeyChecking=no',
        '-i', cert_path,
    ]
    hoststring = f"{username}@{self._hostname}:{remote_path}"

    to_remote = local_src is not None
    if to_remote:
        # Transfer to remote
        args = [*prelude, local_src, hoststring]
    else:
        # Transfer from remote
        args = [*prelude, hoststring, local_dest]

    transfer_local = local_src if to_remote else local_dest
    transfer_text = f"'{transfer_local}' {'to' if to_remote else 'from'}"
    self.debug(
        "Transferring file %s %s with options %s; -i %s; retry_delay=%f, retry_count=%d",
        transfer_text, hoststring, json.dumps(prelude), cert_path,
        retry_delay, retry_count)

    task_messages = (f"transfer {transfer_text} host {self._hostname}",
                     f"transfer {transfer_text} remote")
    for current in retry(retry_count, task=task_messages, logger=self.logger):  # pylint: disable=unexpected-keyword-arg
        child = pexpect.spawn(command="scp", args=args)
        try:
            child.expect(pexpect.EOF)
        finally:
            # Always reap the child, even when expect() raises (e.g. on
            # timeout), so the process and its pty are not leaked.
            child.close()

        if child.exitstatus == 0:
            # Successful
            return True
        current.failed(f"exit code ({child.exitstatus})")
def login(self, retry_count=5):
    """Log the Selenium driver into Cloudlab, retrying on failure.

    Loads the login page, and if the session is not already on the user
    dashboard, fills in ``self._username`` / ``self._password`` and submits
    the form. ``self._authenticated`` is updated to reflect what was seen.

    Args:
        retry_count: Attempt budget handed to ``retry``.

    Returns:
        None once the dashboard page is reached.
    """

    def page_loaded(browser):
        return browser.execute_script(
            'return document.readyState') == 'complete'

    browser = self._driver
    attempts = retry(retry_count, task="log into Cloudlab", logger=self.logger)  # pylint: disable=unexpected-keyword-arg
    for attempt in attempts:
        browser.get("https://www.cloudlab.us/login.php")
        WebDriverWait(browser, 60).until(page_loaded)

        # Already logged in: the login URL landed on the dashboard.
        if 'User Dashboard' in browser.title:
            self._authenticated = True
            return
        if 'Login' not in browser.title:
            url, title = browser.current_url, browser.title
            attempt.failed(f'unknown page reached "{title}" @ {url}"')
            continue
        self._authenticated = False

        # Fill in and submit the credentials form.
        try:
            browser.find_element(By.NAME, "uid").click()
            browser.find_element(By.NAME, "uid").send_keys(self._username)
            browser.find_element(By.NAME, "password").send_keys(self._password)
            browser.find_element(By.NAME, "login").click()
        except Exception as form_error:
            attempt.failed("could not interact with login form", form_error)
            continue

        WebDriverWait(browser, 60).until(page_loaded)
        if 'User Dashboard' in browser.title:
            self._authenticated = True
            return
        # NOTE(review): when the page is still "Login" here (e.g. rejected
        # credentials) no failure is recorded on the attempt -- confirm that
        # retry() handles this as intended.
        if 'Login' not in browser.title:
            url, title = browser.current_url, browser.title
            attempt.failed(f'unknown page reached "{title}" @ {url}"')
def execute(self):
    """Run the experiment on the remote host, retrying up to 5 attempts.

    Per attempt: open an SSH terminal, clone the configured repo (optionally
    a single branch) into ``repo``, copy the remote config file into the
    experiment's ``conf`` directory, ``cd`` into the experiment directory,
    run ``./scripts/run.sh`` while streaming its output to the debug log,
    and log out.

    Side effects:
        Sets ``self._remote_experiment_path``.

    Raises:
        ExitEarly: Propagated unchanged. Any other exception marks the
        current attempt as failed via ``current.failed``. In both cases an
        opened SSH session is closed first.
    """
    for current in retry(retry_count=5,
                         task=f'executing experiment {self._test.id()}',
                         logger=self.logger):
        ssh = None
        try:
            try:
                # Attach a remote terminal to the executor host
                self.info("Attaching a remote terminal to the executor host")
                ssh = self.terminal(retry_count=10)

                # Clone the repo
                repo = self._config.get("repo")
                remote_folder = "repo"
                self.info("Cloning the repo %s into remote:%s", repo,
                          remote_folder)

                # Build the git command with optional branch support
                git_command = ["git", "clone"]
                branch = self._config.get("branch", None)
                if branch:
                    git_command.extend(
                        ["--single-branch", "--branch", f'"{branch}"'])
                git_command.extend([f'"{repo}"', remote_folder])
                clone_sequence = [
                    f'sudo rm -rf {remote_folder}',
                    ' '.join(git_command),
                ]
                self.run_sequence(ssh,
                                  sequence=clone_sequence,
                                  retry_count=10,
                                  timeout=120)

                # Copy the config file into place
                experiments_path = self._config.get("experiments_path",
                                                    "experiments")
                self._remote_experiment_path = path.join(
                    remote_folder, experiments_path, self._test.experiment())
                remote_config = self._config.get("remote_config", "config.sh")
                dest_config_path = path.join(self._remote_experiment_path,
                                             "conf", remote_config)
                self.debug(
                    "Copy the config file from remote:%s into place at remote:%s",
                    remote_config, dest_config_path)
                self.run_sequence(
                    ssh,
                    sequence=[f'cp {remote_config} {dest_config_path}'],
                    retry_count=10)

                # Change the working directory to the experiment root
                # (eventual destination of results)
                self.debug("Change the working directory to remote:%s",
                           self._remote_experiment_path)
                self.run_sequence(
                    ssh,
                    sequence=[f'cd {self._remote_experiment_path}'],
                    retry_count=1)

                # Run the primary script
                # NOTE(review): the script's exit status is not inspected.
                script_path = './scripts/run.sh'
                self.info("Running primary script at remote:%s", script_path)
                ssh.sendline(script_path)
                while not ssh.prompt(timeout=60):
                    self.debug("\n%s", ssh.before.decode().strip())
                    if ssh.before:
                        # Presumably consumes the already-logged output so it
                        # is not reported again -- confirm.
                        ssh.expect(r'.+')
                self.debug("\n%s", ssh.before.decode().strip())
                self.info("Finished primary script")
                ssh.logout()
                return
            except BaseException:
                # Failures outside run_sequence (e.g. while streaming script
                # output) would otherwise leak the session on every retry.
                if ssh is not None:
                    try:
                        ssh.close()
                    except Exception as close_ex:  # best-effort cleanup
                        self.debug("Could not close ssh session: %s",
                                   close_ex)
                raise
        except ExitEarly:
            raise
        except Exception as ex:
            current.failed(ex)
def provision(self, profile, name=None, expires_in=5, retry_count=5):
    """Instantiate a Cloudlab experiment through the web UI, with retries.

    Walks the instantiate wizard (profile selection, optional name,
    duration), waits for the status page to report ``ready``, then reads the
    node hostnames from the list view. Once the experiment exists, failures
    in the status/list-view steps terminate it via ``safe_terminate`` before
    the attempt is retried.

    Args:
        profile: Value of the ``name`` attribute of the profile list item.
        name: Optional experiment name typed into the wizard.
        expires_in: Value typed into the duration field.
        retry_count: Attempt budget handed to ``retry`` and to nested
            ``login`` / ``safe_terminate`` calls.

    Returns:
        An ``Experiment`` built from the uuid, name, profile and hostnames.
    """

    def page_loaded(browser):
        return browser.execute_script(
            'return document.readyState') == 'complete'

    driver = self._driver
    task = f"provision experiment {f'with name {name} ' if name is not None else ''}on Cloudlab"
    for current in retry(retry_count, task=task, logger=self.logger):  # pylint: disable=unexpected-keyword-arg
        try:
            if not self._authenticated:
                self.login(retry_count=retry_count)
        except Exception as ex:
            current.failed("could not log in", ex)
            continue

        driver.get("https://www.cloudlab.us/instantiate.php")
        WebDriverWait(driver, 60).until(page_loaded)

        # Make sure we're authenticated
        if "Login" in driver.title:
            self._authenticated = False
            try:
                self.login(retry_count=retry_count)
            except Exception as ex:
                current.failed("could not log in", ex)
                continue

        try:
            WebDriverWait(driver, 60).until(
                expected_conditions.element_to_be_clickable(
                    (By.ID, "change-profile")))
            driver.find_element(By.ID, "change-profile").click()
            try:
                # Wait for page to select initial profile (otherwise the
                # selection will be cleared)
                WebDriverWait(driver, 60).until(
                    expected_conditions.presence_of_element_located(
                        (By.CSS_SELECTOR, "li.profile-item.selected")))
            except TimeoutException:
                # Ignore timeouts here
                pass
            driver.find_element(By.XPATH,
                                f"//li[@name='{profile}']").click()
            WebDriverWait(driver, 60).until(
                expected_conditions.presence_of_element_located((
                    By.XPATH,
                    f"//li[@name='{profile}' and contains(@class, 'selected')]"
                )))
            driver.find_element(
                By.XPATH,
                "//button[contains(text(),'Select Profile')]").click()
            WebDriverWait(driver, 60).until(
                expected_conditions.element_to_be_clickable(
                    (By.LINK_TEXT, "Next")))
            driver.find_element(By.LINK_TEXT, "Next").click()

            # Set name if given
            if name is not None:
                driver.find_element(By.ID, "experiment_name").click()
                driver.find_element(By.ID,
                                    "experiment_name").send_keys(name)
            WebDriverWait(driver, 60).until(
                expected_conditions.element_to_be_clickable(
                    (By.LINK_TEXT, "Next")))
            driver.find_element(By.LINK_TEXT, "Next").click()

            WebDriverWait(driver, 60).until(
                expected_conditions.element_to_be_clickable(
                    (By.ID, "experiment_duration")))
            driver.find_element(By.ID, "experiment_duration").click()
            driver.find_element(By.ID, "experiment_duration").clear()
            driver.find_element(By.ID, "experiment_duration").send_keys(
                str(expires_in))
            WebDriverWait(driver, 60).until(
                expected_conditions.element_to_be_clickable(
                    (By.LINK_TEXT, "Finish")))
            driver.find_element(By.LINK_TEXT, "Finish").click()
        except Exception as ex:
            current.failed(ex)
            continue

        try:
            # Wait until the info page has been loaded
            WebDriverWait(driver, 60).until(
                expected_conditions.title_contains("Experiment Status"))
            WebDriverWait(driver, 60).until(page_loaded)
        except TimeoutException as ex:
            # Can't really clean up if an error occurs here, so hope it doesn't
            if 'Login' in driver.title:
                # Reset the flag first so it sticks even if failed() raises.
                self._authenticated = False
                current.failed('not logged in', ex)
            elif 'Instantiate' in driver.title:
                current.failed('still on instantiate page after wait', ex)
            else:
                url, title = driver.current_url, driver.title
                current.failed(f'unknown page reached "{title}" @ {url}"')
            continue

        # Consider the experiment provisioned here, so any failures from
        # here on need to be cleaned up (experiment terminated)
        # NOTE(review): the waits and uuid lookup just below are not wrapped
        # in cleanup; an exception there leaves the experiment running.
        name_xpath = "//td[contains(.,'Name:')]/following-sibling::td"
        WebDriverWait(driver, 60).until(
            expected_conditions.presence_of_element_located(
                (By.XPATH, name_xpath)))
        WebDriverWait(driver, 60).until(
            lambda browser: browser.find_element(
                By.XPATH, name_xpath).text.strip() != '')
        exp_name = driver.find_element(By.XPATH, name_xpath).text
        url_parts = urllib.parse.urlparse(driver.current_url)
        uuid = urllib.parse.parse_qs(url_parts.query).get("uuid")[0]
        experiment = ProvisionedExperiment(uuid=uuid,
                                           name=exp_name,
                                           profile=profile)
        self.info(f"Instantiating experiment {experiment}")

        # Wait on status until "ready" or something else
        status_xpath = "//span[@id='quickvm_status']"
        status = driver.find_element(By.XPATH, status_xpath).text
        if status != "ready":
            self.debug("Waiting for experiment to become ready")
        failed = False
        while status != "ready":
            try:
                WebDriverWait(driver, 60).until(
                    expected_conditions.text_to_be_present_in_element(
                        (By.XPATH, status_xpath), "ready"))
            except TimeoutException:
                status = driver.find_element(By.XPATH, status_xpath).text
                if status == "terminating":
                    # Already terminating; fail this attempt and try again
                    current.failed("experiment is marked as terminating")
                    failed = True
                    break
                elif status == "ready":
                    break
                elif status in ('created', 'provisioning', 'booting'):
                    # Good; keep waiting
                    continue
                else:
                    # If "failed" or otherwise, assume failure; need to
                    # clean up. Read the error text once, and guard against
                    # a falsy result (other call sites in this method check
                    # it with `if error_text`).
                    raw_error = self.get_error_text()
                    cloudlab_error = raw_error or ""
                    self.error(
                        "Experiment is marked as %s: stopping; trying to terminate. %s",
                        status, raw_error)
                    self.safe_terminate(experiment, retry_count=retry_count)
                    if "Resource reservation violation" in cloudlab_error:
                        current.failed('resource reservation violation')
                    elif re.search(NOT_ENOUGH_REGEX, cloudlab_error):
                        current.failed('insufficient nodes available')
                    else:
                        current.failed('error during provisioning')
                    failed = True
                    break
            else:
                status = "ready"
                break

        if failed or status != "ready":
            continue

        try:
            # Navigate to list panel
            WebDriverWait(driver, 60).until(
                expected_conditions.visibility_of_element_located(
                    (By.ID, "show_listview_tab")))
            driver.find_element(By.ID, "show_listview_tab").click()
        except (TimeoutException, NoSuchElementException) as ex:
            self.warning(
                "An error ocurred while attempting to expand the experiment listview"
            )
            error_text = self.get_error_text()
            if error_text:
                self.warning(error_text)
            current.failed("could not expand the experiment listview!", ex)
            self.debug("Terminating experiment %s", experiment)
            self.safe_terminate(experiment, retry_count=retry_count)
            continue

        # Should be ready here, read hostnames
        ssh_commands = [
            elem.text for elem in driver.find_elements(
                By.XPATH, "//td[@name='sshurl']//kbd")
        ]
        if not ssh_commands:
            current.failed("parsed hostnames list was empty")
            error_text = self.get_error_text()
            if error_text:
                self.warning(error_text)
            self.debug("Terminating experiment %s", experiment)
            self.safe_terminate(experiment, retry_count=retry_count)
            continue

        hostnames = []
        for ssh_command in ssh_commands:
            match_obj = re.search(SSH_REGEX, ssh_command)
            if match_obj:
                hostnames.append(match_obj.group(1))

        # Experiment successfully provisioned, hostnames extracted
        return Experiment(experiment.uuid(), experiment.name(),
                          experiment.profile(), hostnames)
def terminate(self, experiment, retry_count=5):
    """Terminate a Cloudlab experiment through its status page, with retries.

    Opens the experiment's status page, expands the header if the terminate
    button is not visible, clicks the button and confirms in the modal.

    Args:
        experiment: Object whose ``uuid()`` identifies the experiment.
        retry_count: Attempt budget handed to ``retry`` and to nested
            ``login`` calls.

    Returns:
        None once termination has been confirmed in the UI.
    """

    def page_loaded(browser):
        return browser.execute_script(
            'return document.readyState') == 'complete'

    driver = self._driver
    task = f"terminate experiment {experiment} on Cloudlab"
    for current in retry(retry_count, task=task, logger=self.logger):  # pylint: disable=unexpected-keyword-arg
        try:
            if not self._authenticated:
                self.login(retry_count=retry_count)
        except Exception as ex:
            current.failed("could not log in", ex)
            continue

        driver.get(
            f"https://www.cloudlab.us/status.php?uuid={experiment.uuid()}")
        WebDriverWait(driver, 60).until(page_loaded)

        # Make sure we're authenticated
        if 'Login' in driver.title:
            self._authenticated = False
            try:
                self.login(retry_count=retry_count)
            except Exception as ex:
                current.failed("could not log in", ex)
                continue

        # Expand header if collapsed
        try:
            WebDriverWait(driver, 60).until(
                expected_conditions.visibility_of_element_located(
                    (By.ID, "terminate_button")))
        except (NoSuchElementException, TimeoutException):
            WebDriverWait(driver, 60).until(
                expected_conditions.presence_of_element_located(
                    (By.XPATH, "//a[@id='profile_status_toggle']")))
            driver.find_element(
                By.XPATH, "//a[@id='profile_status_toggle']").click()
            WebDriverWait(driver, 60).until(
                expected_conditions.visibility_of_element_located(
                    (By.ID, "terminate_button")))

        # Existence check only; the button is looked up again before the click.
        try:
            driver.find_element(By.ID, "terminate_button")
        except NoSuchElementException as ex:
            current.failed(
                "terminate button could not be found even after expanding",
                ex)
            continue

        try:
            # Click terminate and confirm
            WebDriverWait(driver, 240).until(
                expected_conditions.element_to_be_clickable(
                    (By.ID, "terminate_button")))
            driver.find_element(By.ID, "terminate_button").click()
            WebDriverWait(driver, 60).until(
                expected_conditions.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#terminate_modal #terminate")))
            driver.find_element(By.CSS_SELECTOR,
                                "#terminate_modal #terminate").click()
        except (TimeoutException, TimeoutError) as ex:
            # WebDriverWait raises selenium's TimeoutException, not the
            # builtin TimeoutError; bind the exception so it can be reported.
            current.failed(
                "could not wait on terminate pathway to become clickable",
                ex)
        else:
            self.info("Terminated experiment %s", experiment)
            return
def run(config: Dict[str, Any], repo_path: str) -> None:
    """Validate the configuration, log into Cloudlab and conduct every test.

    Creates the local ``working``, ``logs`` and ``results`` directories,
    checks that the experiments directory exists, obtains the Cloudlab
    password (from ``password_path`` or an interactive prompt), builds the
    module-level ``cloudlab`` driver and logs in. Each flattened test is then
    passed to ``conduct_test`` inside a retry loop, waiting on
    ``thread_queue`` so that no more than ``max_concurrency`` entries are
    pending.

    Args:
        config: Parsed configuration mapping.
        repo_path: Local path of the repository containing the experiments.

    Returns:
        None. Returns early (after logging) on invalid configuration, login
        failure, or ``ExitEarly``.
    """
    global cloudlab  # pylint: disable=global-statement, invalid-name

    log.info("Starting automated experiment execution")

    # --- Configuration sanity checks -----------------------------------
    if not config.get("tests"):
        log.error("No tests found. Exiting")
        return
    if "repo" not in config:
        log.error("No repo found. Exiting")
        return

    # Make local directories
    for dirname in ("working", "logs", "results"):
        Path(dirname).mkdir(exist_ok=True)

    # Check for existence of experiments directory
    experiments_dir = path.join(repo_path, config.get("experiments_path", "."))
    if not path.exists(experiments_dir):
        log.error("Experiment directory %s not found", experiments_dir)
        return

    tests = flatten_tests(config)

    # --- Cloudlab credentials ------------------------------------------
    username = config.get("username")
    if username is None:
        log.error("Cloudlab experiment username not specified")
        return

    if 'password_path' not in config:
        password = getpass.getpass(
            prompt=f'Cloudlab password for {username}: ')
    else:
        password_path = config['password_path']
        try:
            with open(password_path, 'r') as password_file:
                password = password_file.read().strip()
        except OSError as read_error:
            log.error("Could not load Cloudlab password file at %s:",
                      password_path)
            log.error(read_error)
            return

    # --- Driver set-up and login ---------------------------------------
    headless = bool(config.get("headless"))
    mode = 'headless' if headless else 'gui'
    log.info("Initializing %s cloudlab driver for %s", mode, username)
    cloudlab = Cloudlab(username, password, headless)

    with cloudlab_lock:
        try:
            log.info("Logging into cloudlab")
            cloudlab.login()
        except ExitEarly:
            return
        except OperationFailed as login_error:
            log.error("Could not log into cloudlab:")
            log.error(login_error)
            log.error(traceback.format_exc())
            return
        except Exception as login_error:
            log.error("Encountered error while logging into cloudlab driver:")
            log.error(login_error)
            log.error(traceback.format_exc())
            return
        else:
            log.info("Cloudlab login successful")

    # --- Test execution ------------------------------------------------
    max_concurrency = config.get("max_concurrency", 1)
    for test in tests:
        test_logger = setup_logger(name=test.id(),
                                   inner=log,
                                   prefix=f"[{test.id()}] ")
        try:
            attempts = retry(task=f"executing test {test.id()}",  # pylint: disable=unexpected-keyword-arg
                             retry_count=5,
                             logger=test_logger)
            for attempt in attempts:
                # Make sure there aren't more than `max_concurrency` tests
                # executing: wait for the oldest queued thread, then drop it.
                while len(thread_queue) >= max_concurrency:
                    thread_queue[0].join()
                    thread_queue.pop(0)

                started = conduct_test(test, attempt, config, experiments_dir,
                                       logger=test_logger)
                if started:
                    # Move to next test if function returns True
                    break
        except ExitEarly:
            return
        except Exception as test_error:
            test_logger.error("failed to conduct test")
            test_logger.error(test_error)
            test_logger.error(traceback.format_exc())