Example #1
0
    def terminal(self, retry_count=1):
        """Open an SSH session to this host and return it once logged in.

        The key path and username come from ``self._config`` (``ssh_cert``
        defaulting to ``"id_rsa"``, ``username`` defaulting to ``"root"``).
        Host key checking is disabled for the connection.

        Args:
            retry_count: number of attempts handed to ``retry``.

        Returns:
            The logged-in ``pxssh.pxssh`` object.

        Raises:
            Whatever escapes the retry loop; the session is closed before the
            exception is re-raised.
        """
        key_file = self._config.get("ssh_cert", "id_rsa")
        user = self._config.get("username", "root")
        host = self._hostname
        ssh_options = {
            "StrictHostKeyChecking": "no",
            "UserKnownHostsFile": "/dev/null",
        }
        session = pxssh.pxssh(options=ssh_options)
        summary = f"-i {key_file}; retry_count={retry_count}"
        self.debug("SSHing into %s@%s with options %s; %s", user, host,
                   json.dumps(ssh_options), summary)

        try:
            messages = (f"attach ssh terminal to host {self._hostname}",
                        "attach ssh terminal to remote")
            attempts = retry(retry_count, task=messages, logger=self.logger)  # pylint: disable=unexpected-keyword-arg
            for attempt in attempts:
                try:
                    session.login(host, username=user, ssh_key=key_file)
                except pxssh.ExceptionPxssh as login_error:
                    # Report the failed login and let the retry helper decide
                    attempt.failed(login_error)
                    continue
                return session
        except BaseException:
            # Release the session on any failure (including interrupts),
            # then propagate the original exception
            session.close()
            raise
Example #2
0
    def run_sequence(self, ssh, sequence, retry_count=1, timeout=30):
        """Run shell commands in order over an open SSH session, with retries.

        Each command is sent, its output collected until the prompt returns,
        and its exit status read back with ``echo $?``. The first command with
        a non-zero (or unreadable) exit status fails the current attempt, and
        the next attempt restarts the sequence from its first command.

        Args:
            ssh: logged-in pxssh-style session (``sendline``, ``prompt``,
                ``before``, ``sendcontrol``, ``close`` are used).
            sequence: iterable of command strings to execute in order.
            retry_count: number of attempts handed to ``retry``.
            timeout: seconds to wait for the prompt after each command before
                sending Ctrl-C and waiting again.

        Returns:
            None once every command in the sequence exited with status 0.

        Raises:
            Whatever escapes the retry loop; ``ssh`` is closed before the
            exception is re-raised.
        """
        self.debug("Executing sequence %s with options retry_count=%d",
                   sequence, retry_count)

        # Capture failed exceptions to close resources
        try:
            task_messages = (
                f"execute command sequence to host {self._hostname}",
                "execute command sequence to remote")
            for current in retry(retry_count,
                                 task=task_messages,
                                 logger=self.logger):  # pylint: disable=unexpected-keyword-arg
                for command in sequence:
                    ssh.sendline(command)
                    output = ""
                    # No prompt within the timeout: keep what was printed so
                    # far and interrupt the command
                    while not ssh.prompt(timeout=timeout):
                        output += ssh.before.decode()
                        ssh.sendcontrol('c')
                    output += ssh.before.decode()

                    ssh.sendline("echo $?")
                    ssh.prompt(timeout=10)

                    result = ssh.before.decode().strip().splitlines()
                    # Assume failure unless a numeric exit status can be read.
                    # int() raises ValueError on non-numeric text, and an empty
                    # result has no last line (IndexError).
                    exitcode = 1
                    try:
                        exitcode = int(result[-1])
                    except (IndexError, ValueError):
                        self.warning(
                            "Couldn't decode exit code from command %s: %s",
                            command, result)
                    self.debug("[%s] %s", str(exitcode), output)

                    # If exit code is non-zero, assume failed
                    if exitcode != 0:
                        current.failed(f"command: {command}")
                        break
                else:
                    # Every command succeeded (or the sequence was empty)
                    return

        # Close ssh connection before re-throwing exception
        except BaseException:
            ssh.close()
            raise
Example #3
0
    def transfer(self,
                 local_src=None,
                 local_dest=None,
                 remote_path=None,
                 retry_count=1):
        """Copy a file to or from this host with ``scp``, retrying on failure.

        When ``local_src`` is given the file is uploaded to ``remote_path``;
        otherwise ``remote_path`` is downloaded to ``local_dest``.

        Args:
            local_src: local file to upload, or None to download instead.
            local_dest: local destination used when downloading.
            remote_path: path on the remote host.
            retry_count: number of attempts handed to ``retry``.

        Returns:
            True as soon as an ``scp`` run exits with status 0.
        """
        key_file = self._config.get("ssh_cert", "id_rsa")
        user = self._config.get("username", "root")
        # Only reported in the debug line below; not otherwise used here
        delay = self._config.get("retry_delay", 120)

        scp_options = [
            '-o', 'UserKnownHostsFile=/dev/null',
            '-o', 'StrictHostKeyChecking=no',
            '-i', key_file,
        ]
        remote_spec = f"{user}@{self._hostname}:{remote_path}"

        uploading = local_src is not None
        if uploading:
            local_file, direction = local_src, 'to'
            scp_args = scp_options + [local_src, remote_spec]
        else:
            local_file, direction = local_dest, 'from'
            scp_args = scp_options + [remote_spec, local_dest]

        description = f"'{local_file}' {direction}"
        self.debug(
            "Transferring file %s %s with options %s; -i %s; retry_delay=%f, retry_count=%d",
            description, remote_spec, json.dumps(scp_options), key_file,
            delay, retry_count)

        messages = (f"transfer {description} host {self._hostname}",
                    f"transfer {description} remote")
        attempts = retry(retry_count, task=messages, logger=self.logger)  # pylint: disable=unexpected-keyword-arg
        for attempt in attempts:
            scp = pexpect.spawn(command="scp", args=scp_args)
            scp.expect(pexpect.EOF)
            scp.close()

            if scp.exitstatus != 0:
                attempt.failed(f"exit code ({scp.exitstatus})")
                continue
            # Successful
            return True
Example #4
0
    def login(self, retry_count=5):
        """Log the web driver into Cloudlab, retrying on failure.

        Loads the login page; if the dashboard is already shown the driver is
        marked authenticated straight away. Otherwise the login form is filled
        with ``self._username`` / ``self._password`` and submitted, and the
        resulting page title decides the outcome.

        Args:
            retry_count: number of attempts handed to ``retry``.

        Returns:
            None once ``self._authenticated`` has been set to True.
        """
        driver = self._driver

        def wait_until_loaded():
            # Block (up to 60s) until the browser reports the document loaded
            WebDriverWait(driver, 60).until(
                lambda drv: drv.execute_script('return document.readyState')
                == 'complete')

        attempts = retry(retry_count, task="log into Cloudlab", logger=self.logger)  # pylint: disable=unexpected-keyword-arg
        for attempt in attempts:
            driver.get("https://www.cloudlab.us/login.php")
            wait_until_loaded()

            # Already signed in: nothing more to do
            if 'User Dashboard' in driver.title:
                self._authenticated = True
                return
            # Neither dashboard nor login page: give up on this attempt
            if 'Login' not in driver.title:
                url, title = driver.current_url, driver.title
                attempt.failed(f'unknown page reached "{title}" @ {url}"')
                continue
            self._authenticated = False

            try:
                driver.find_element(By.NAME, "uid").click()
                driver.find_element(By.NAME, "uid").send_keys(self._username)
                password_field = driver.find_element(By.NAME, "password")
                password_field.send_keys(self._password)
                driver.find_element(By.NAME, "login").click()
            except Exception as form_error:
                attempt.failed("could not interact with login form",
                               form_error)
                continue

            wait_until_loaded()

            if 'User Dashboard' in driver.title:
                self._authenticated = True
                return
            # Still on the login page falls through to the next attempt
            # without reporting a failure; any other page is reported
            if 'Login' not in driver.title:
                url, title = driver.current_url, driver.title
                attempt.failed(f'unknown page reached "{title}" @ {url}"')
Example #5
0
    def execute(self):
        """Run the experiment's primary script on the executor host over SSH.

        Each attempt (``retry`` is asked for 5) opens a terminal with
        ``self.terminal``, removes and re-clones the configured repo, copies
        the remote config file into the experiment's ``conf`` directory,
        changes into the experiment directory and runs ``./scripts/run.sh``,
        logging its output until the shell prompt returns.

        Side effects:
            Sets ``self._remote_experiment_path``.

        Returns:
            None, after the primary script finished and the session was
            logged out.

        Raises:
            ExitEarly: re-raised untouched. Any other exception is handed to
                ``current.failed`` rather than propagated from here.
        """
        for current in retry(retry_count=5,
                             task=f'executing experiment {self._test.id()}',
                             logger=self.logger):
            try:
                # Attach a remote terminal to the executor host
                self.info("Attaching a remote terminal to the executor host")
                ssh = self.terminal(retry_count=10)

                # Clone the repo
                repo = self._config.get("repo")
                remote_folder = "repo"
                self.info("Cloning the repo %s into remote:%s", repo,
                          remote_folder)

                # Build the git command with optional branch support.
                # Branch and repo are wrapped in double quotes because the
                # parts are joined into a single shell command line below.
                git_command = ["git", "clone"]
                branch = self._config.get("branch", None)
                if branch:
                    git_command.extend(
                        ["--single-branch", "--branch", f'"{branch}"'])
                git_command.extend([f'"{repo}"', remote_folder])

                # Remove any previous checkout first, then clone
                clone_sequence = [
                    f'sudo rm -rf {remote_folder}', ' '.join(git_command)
                ]
                self.run_sequence(ssh,
                                  sequence=clone_sequence,
                                  retry_count=10,
                                  timeout=120)

                # Copy the config file into place
                experiments_path = self._config.get("experiments_path",
                                                    "experiments")
                self._remote_experiment_path = path.join(
                    remote_folder, experiments_path, self._test.experiment())
                remote_config = self._config.get("remote_config", "config.sh")
                dest_config_path = path.join(self._remote_experiment_path,
                                             "conf", remote_config)
                self.debug(
                    "Copy the config file from remote:%s into place at remote:%s",
                    remote_config, dest_config_path)
                self.run_sequence(
                    ssh,
                    sequence=[f'cp {remote_config} {dest_config_path}'],
                    retry_count=10)

                # Change the working directory to the experiment root (eventual destination of results)
                self.debug("Change the working directory to remote:%s",
                           self._remote_experiment_path)
                self.run_sequence(
                    ssh,
                    sequence=[f'cd {self._remote_experiment_path}'],
                    retry_count=1)

                # Run the primary script
                script_path = './scripts/run.sh'
                self.info("Running primary script at remote:%s", script_path)
                ssh.sendline(script_path)
                # Poll for the prompt in 60s windows, logging whatever the
                # script has printed in the meantime
                while not ssh.prompt(timeout=60):
                    self.debug("\n%s", ssh.before.decode().strip())
                    if ssh.before:
                        # presumably consumes the pending output so it is not
                        # logged again on the next pass -- TODO confirm
                        ssh.expect(r'.+')
                self.debug("\n%s", ssh.before.decode().strip())
                self.info("Finished primary script")
                ssh.logout()
                return
            except ExitEarly:
                raise
            except Exception as ex:
                # NOTE(review): `ssh` is not closed on this path; the helper
                # methods above close it on their own failures, but errors
                # raised while running the primary script may leave it open
                current.failed(ex)
Example #6
0
    def provision(self, profile, name=None, expires_in=5, retry_count=5):
        """Instantiate an experiment through the Cloudlab web UI.

        Each attempt logs in if needed, walks the instantiate wizard (select
        profile, optional name, duration, finish), waits for the status page,
        polls the status element until it reads "ready", then reads the SSH
        commands from the list view to extract hostnames. Once the status page
        has been reached, failures terminate the experiment via
        ``self.safe_terminate`` before the next attempt.

        Args:
            profile: value of the ``name`` attribute of the profile list item
                to select.
            name: optional experiment name typed into the wizard.
            expires_in: value typed into the "experiment_duration" field
                (units are not visible here -- presumably hours; confirm).
            retry_count: attempts handed to ``retry`` and passed on to
                ``self.login`` / ``self.safe_terminate``.

        Returns:
            An ``Experiment`` built from the provisioned experiment's uuid,
            name and profile plus the hostnames matched by ``SSH_REGEX``.
        """
        driver = self._driver
        task = f"provision experiment {f'with name {name} ' if name is not None else ''}on Cloudlab"
        for current in retry(retry_count, task=task, logger=self.logger):  # pylint: disable=unexpected-keyword-arg
            try:
                if not self._authenticated:
                    self.login(retry_count=retry_count)
            except Exception as ex:
                current.failed("could not log in", ex)
                continue

            driver.get("https://www.cloudlab.us/instantiate.php")
            WebDriverWait(driver,
                          60).until(lambda driver: driver.execute_script(
                              'return document.readyState') == 'complete')

            # Make sure we're authenticated
            if "Login" in driver.title:
                self._authenticated = False
                try:
                    self.login(retry_count=retry_count)
                except Exception as ex:
                    current.failed("could not log in", ex)
                    continue

            # Walk through the instantiate wizard; any error here fails the
            # attempt before anything has been provisioned
            try:
                WebDriverWait(driver, 60).until(
                    expected_conditions.element_to_be_clickable(
                        (By.ID, "change-profile")))
                driver.find_element(By.ID, "change-profile").click()
                try:
                    # Wait for page to select initial profile (otherwise the selection will be cleared)
                    WebDriverWait(driver, 60).until(
                        expected_conditions.presence_of_element_located(
                            (By.CSS_SELECTOR, "li.profile-item.selected")))
                except TimeoutException:
                    # Ignore timeouts here
                    pass
                driver.find_element(By.XPATH,
                                    f"//li[@name='{profile}']").click()
                WebDriverWait(driver, 60).until(
                    expected_conditions.presence_of_element_located((
                        By.XPATH,
                        f"//li[@name='{profile}' and contains(@class, 'selected')]"
                    )))
                driver.find_element(
                    By.XPATH,
                    "//button[contains(text(),'Select Profile')]").click()
                WebDriverWait(driver, 60).until(
                    expected_conditions.element_to_be_clickable(
                        (By.LINK_TEXT, "Next")))
                driver.find_element(By.LINK_TEXT, "Next").click()

                # Set name if given
                if name is not None:
                    driver.find_element(By.ID, "experiment_name").click()
                    driver.find_element(By.ID,
                                        "experiment_name").send_keys(name)

                WebDriverWait(driver, 60).until(
                    expected_conditions.element_to_be_clickable(
                        (By.LINK_TEXT, "Next")))
                driver.find_element(By.LINK_TEXT, "Next").click()
                WebDriverWait(driver, 60).until(
                    expected_conditions.element_to_be_clickable(
                        (By.ID, "experiment_duration")))
                driver.find_element(By.ID, "experiment_duration").click()
                driver.find_element(By.ID, "experiment_duration").clear()
                driver.find_element(By.ID, "experiment_duration").send_keys(
                    str(expires_in))
                WebDriverWait(driver, 60).until(
                    expected_conditions.element_to_be_clickable(
                        (By.LINK_TEXT, "Finish")))
                driver.find_element(By.LINK_TEXT, "Finish").click()
            except Exception as ex:
                current.failed(ex)
                continue

            try:
                # Wait until the info page has been loaded
                WebDriverWait(driver, 60).until(
                    expected_conditions.title_contains("Experiment Status"))
                WebDriverWait(driver,
                              60).until(lambda driver: driver.execute_script(
                                  'return document.readyState') == 'complete')
            except TimeoutException as ex:
                # Can't really clean up if an error occurs here, so hope it doesn't
                if 'Login' in driver.title:
                    current.failed('not logged in', ex)
                    self._authenticated = False
                    continue
                elif 'Instantiate' in driver.title:
                    current.failed('still on instantiate page after wait', ex)
                    continue
                else:
                    url, title = driver.current_url, driver.title
                    current.failed(f'unknown page reached "{title}" @ {url}"')
                    continue

            # Consider the experiment provisioned here, so any failures from here on need
            # to be cleaned up (experiment terminated)
            name_xpath = "//td[contains(.,'Name:')]/following-sibling::td"
            WebDriverWait(driver, 60).until(
                expected_conditions.presence_of_element_located(
                    (By.XPATH, name_xpath)))
            # The cell can exist before its text is filled in, so also wait
            # for it to become non-empty
            WebDriverWait(
                driver, 60).until(lambda driver: driver.find_element_by_xpath(
                    name_xpath).text.strip() != '')
            exp_name = driver.find_element_by_xpath(name_xpath).text
            # The experiment uuid is read from the status page's query string.
            # NOTE(review): a URL without a "uuid" parameter would make
            # `.get("uuid")` return None and the index fail -- confirm it is
            # always present
            url_parts = urllib.parse.urlparse(driver.current_url)
            uuid = urllib.parse.parse_qs(url_parts.query).get("uuid")[0]
            experiment = ProvisionedExperiment(uuid=uuid,
                                               name=exp_name,
                                               profile=profile)
            self.info(f"Instantiating experiment {experiment}")

            # Wait on status until "ready" or something else
            status_xpath = "//span[@id='quickvm_status']"
            status = driver.find_element_by_xpath(status_xpath).text
            if status != "ready":
                self.debug(f"Waiting for experiment to become ready")

            # Poll in 60s windows; each timeout re-reads the status text to
            # decide whether to keep waiting, give up, or clean up
            failed = False
            while status != "ready":
                try:
                    WebDriverWait(driver, 60).until(
                        expected_conditions.text_to_be_present_in_element(
                            (By.XPATH, status_xpath), "ready"))
                except TimeoutException:
                    status = driver.find_element_by_xpath(status_xpath).text
                    if status == "terminating":
                        # Already terminating; back off for 5 minutes and try again
                        current.failed("experiment is marked as terminating")
                        failed = True
                        break
                    elif status == "ready":
                        break
                    elif status == 'created' or status == 'provisioning' or status == 'booting':
                        # Good; keep waiting
                        continue
                    else:
                        # If "failed" or otherwise, assume failure; need to clean up
                        # Try to extract error
                        # NOTE(review): get_error_text() is called a second
                        # time for the log line below instead of reusing
                        # `cloudlab_error`
                        cloudlab_error = self.get_error_text()
                        self.error(
                            "Experiment is marked as %s: stopping; trying to terminate. %s",
                            status, self.get_error_text())
                        self.safe_terminate(experiment,
                                            retry_count=retry_count)
                        # Classify the failure from the page's error text
                        if "Resource reservation violation" in cloudlab_error:
                            current.failed('resource reservation violation')
                        elif re.search(NOT_ENOUGH_REGEX, cloudlab_error):
                            current.failed('insufficient nodes available')
                        else:
                            current.failed('error during provisioning')
                        failed = True
                        break
                else:
                    # The wait returned without timing out: "ready" was seen
                    status = "ready"
                    break

            if failed or status != "ready":
                continue

            try:
                # Navigate to list panel
                WebDriverWait(driver, 60).until(
                    expected_conditions.visibility_of_element_located(
                        (By.ID, "show_listview_tab")))
                driver.find_element(By.ID, "show_listview_tab").click()
            except (TimeoutException, NoSuchElementException) as ex:
                self.warning(
                    "An error ocurred while attempting to expand the experiment listview"
                )
                error_text = self.get_error_text()
                if error_text:
                    self.warning(error_text)
                current.failed("could not expand the experiment listview!", ex)
                self.debug("Terminating experiment %s", experiment)
                self.safe_terminate(experiment, retry_count=retry_count)
                continue

            # Should be ready here, read hostnames
            ssh_commands = [
                elem.text for elem in driver.find_elements_by_xpath(
                    "//td[@name='sshurl']//kbd")
            ]
            if not ssh_commands:
                current.failed("parsed hostnames list was empty")
                error_text = self.get_error_text()
                if error_text:
                    self.warning(error_text)
                self.debug("Terminating experiment %s", experiment)
                self.safe_terminate(experiment, retry_count=retry_count)
                continue
            # Keep the first capture group of every SSH command that matches;
            # commands that do not match SSH_REGEX are skipped silently
            hostnames = []
            for ssh_command in ssh_commands:
                match_obj = re.search(SSH_REGEX, ssh_command)
                if match_obj:
                    hostnames.append(match_obj.group(1))

            # Experiment successfully provisioned, hostnames extracted
            return Experiment(experiment.uuid(), experiment.name(),
                              experiment.profile(), hostnames)
Example #7
0
    def terminate(self, experiment, retry_count=5):
        """Terminate a provisioned experiment through the Cloudlab web UI.

        Each attempt makes sure the driver is logged in, opens the
        experiment's status page, expands the status header when the
        terminate button is not visible, then clicks the button and confirms
        in the modal dialog.

        Args:
            experiment: object exposing ``uuid()``; also interpolated into the
                task and log messages.
            retry_count: attempts handed to ``retry`` and passed on to
                ``self.login``.

        Returns:
            None once the confirmation button has been clicked.
        """
        driver = self._driver
        task = f"terminate experiment {experiment} on Cloudlab"
        for current in retry(retry_count, task=task, logger=self.logger):  # pylint: disable=unexpected-keyword-arg
            try:
                if not self._authenticated:
                    self.login(retry_count=retry_count)
            except Exception as ex:
                current.failed("could not log in", ex)
                continue

            driver.get(
                f"https://www.cloudlab.us/status.php?uuid={experiment.uuid()}")
            WebDriverWait(driver,
                          60).until(lambda driver: driver.execute_script(
                              'return document.readyState') == 'complete')
            # Make sure we're authenticated
            if 'Login' in driver.title:
                self._authenticated = False
                try:
                    self.login(retry_count=retry_count)
                except Exception as ex:
                    current.failed("could not log in", ex)
                    continue

            # Expand header if collapsed
            try:
                WebDriverWait(driver, 60).until(
                    expected_conditions.visibility_of_element_located(
                        (By.ID, "terminate_button")))
            except (NoSuchElementException, TimeoutException):
                WebDriverWait(driver, 60).until(
                    expected_conditions.presence_of_element_located(
                        (By.XPATH, "//a[@id='profile_status_toggle']")))
                driver.find_element(
                    By.XPATH, "//a[@id='profile_status_toggle']").click()
                WebDriverWait(driver, 60).until(
                    expected_conditions.visibility_of_element_located(
                        (By.ID, "terminate_button")))
                try:
                    # Existence check only; the button is looked up again
                    # right before it is clicked
                    driver.find_element(By.ID, "terminate_button")
                except NoSuchElementException as ex:
                    current.failed(
                        "terminate button could not be found even after expanding",
                        ex)
                    continue

            try:
                # Click terminate and confirm
                WebDriverWait(driver, 240).until(
                    expected_conditions.element_to_be_clickable(
                        (By.ID, "terminate_button")))
                driver.find_element(By.ID, "terminate_button").click()
                WebDriverWait(driver, 60).until(
                    expected_conditions.element_to_be_clickable(
                        (By.CSS_SELECTOR, "#terminate_modal #terminate")))
                driver.find_element(By.CSS_SELECTOR,
                                    "#terminate_modal #terminate").click()
            except (TimeoutException, TimeoutError) as ex:
                # WebDriverWait signals a timeout with Selenium's
                # TimeoutException, which is unrelated to the builtin
                # TimeoutError; the builtin stays listed for compatibility
                current.failed(
                    "could not wait on terminate pathway to become clickable",
                    ex)
            else:
                self.info("Terminated experiment %s", experiment)
                return
Example #8
0
def run(config: Dict[str, Any], repo_path: str) -> None:
    """Run every configured test against Cloudlab.

    Validates the configuration, creates the local ``working``, ``logs`` and
    ``results`` directories, obtains the Cloudlab password (from the file at
    ``config['password_path']`` or an interactive prompt), creates the
    module-global ``cloudlab`` driver, logs in, and then conducts each
    flattened test with up to 5 retry attempts.

    Args:
        config: parsed configuration; must contain non-empty ``"tests"``, a
            ``"repo"`` entry and a ``"username"``. Optional keys read here:
            ``experiments_path``, ``password_path``, ``headless``,
            ``max_concurrency``.
        repo_path: local path joined with ``experiments_path`` to locate the
            experiments directory.

    Returns:
        None. Validation and login failures are logged and cause an early
        return rather than an exception.
    """
    log.info("Starting automated experiment execution")
    if "tests" not in config or not config["tests"]:
        log.error("No tests found. Exiting")
        return

    if "repo" not in config:
        log.error("No repo found. Exiting")
        return

    # Make local directories
    Path("working").mkdir(exist_ok=True)
    Path("logs").mkdir(exist_ok=True)
    Path("results").mkdir(exist_ok=True)

    # Check for existence of experiments directory
    experiments_dir = path.join(repo_path, config.get("experiments_path", "."))
    if not path.exists(experiments_dir):
        log.error("Experiment directory %s not found", experiments_dir)
        return

    tests = flatten_tests(config)

    # Initialize cloudlab driver
    username = config.get("username")
    if username is None:
        log.error("Cloudlab experiment username not specified")
        return

    # Load Cloudlab password
    if 'password_path' in config:
        password_path = config['password_path']
        try:
            with open(password_path, 'r') as password_file:
                password = password_file.read().strip()
        except IOError as ex:
            log.error("Could not load Cloudlab password file at %s:",
                      password_path)
            log.error(ex)
            return
    else:
        # No password file configured: ask on the terminal without echoing
        password = getpass.getpass(
            prompt=f'Cloudlab password for {username}: ')

    # Instantiate the driver
    headless = bool(config.get("headless"))
    # The driver is stored in a module-level global; presumably so other
    # functions in this module can share it -- confirm against its users
    global cloudlab  # pylint: disable=global-statement, invalid-name
    log.info("Initializing %s cloudlab driver for %s",
             'headless' if headless else 'gui', username)
    cloudlab = Cloudlab(username, password, headless)

    # Attempt to log in
    with cloudlab_lock:
        try:
            log.info("Logging into cloudlab")
            cloudlab.login()
        except ExitEarly:
            return
        except OperationFailed as ex:
            log.error("Could not log into cloudlab:")
            log.error(ex)
            log.error(traceback.format_exc())
            return
        except Exception as ex:
            log.error("Encountered error while logging into cloudlab driver:")
            log.error(ex)
            log.error(traceback.format_exc())
            return
        else:
            log.info("Cloudlab login successful")

    max_concurrency = config.get("max_concurrency", 1)

    for test in tests:
        # Per-test logger that prefixes every message with the test id
        test_logger = setup_logger(name=test.id(),
                                   inner=log,
                                   prefix=f"[{test.id()}] ")
        try:
            for current in retry(task=f"executing test {test.id()}",
                                 retry_count=5,
                                 logger=test_logger):  # pylint: disable=unexpected-keyword-arg
                # Make sure there aren't more than `max_concurrency` tests executing
                # (`thread_queue` is defined outside this function; presumably
                # conduct_test appends its worker threads to it -- confirm)
                while len(thread_queue) >= max_concurrency:
                    thread_queue[0].join()
                    thread_queue.pop(0)

                if conduct_test(test,
                                current,
                                config,
                                experiments_dir,
                                logger=test_logger):
                    # Move to next test if function returns True
                    break
        except ExitEarly:
            return
        except Exception as ex:
            # A failure in one test is logged and the loop moves on to the next
            test_logger.error("failed to conduct test")
            test_logger.error(ex)
            test_logger.error(traceback.format_exc())