def collect_results(self):
    """
    Collect experiment results.

    Results are collected task by task. A failure to collect the results
    of a single task is logged and does not prevent collecting the results
    of the remaining tasks.

    Raises:
        SrtUtilsException: if the experiment has not been started yet or
            has not been stopped yet.
    """
    logger.info('Collecting experiment results')

    if not self.is_started:
        raise SrtUtilsException(
            'Experiment has not been started yet. Can not collect results')

    # This is done to prevent the situation when the experiment is still
    # running and we are trying to collect results before stopping it
    if not self.is_stopped:
        raise SrtUtilsException(
            'Experiment is still running. Can not collect results')

    for task in self.tasks:
        # Log through the module-level logger (not the root `logging`
        # module) so that this record is handled like all the others
        logger.info(f'Collecting task results: {task}')

        # This try/except block is needed here in order to collect results
        # for as many tasks as we can in case something has failed
        try:
            task.obj_runner.collect_results()
        except SrtUtilsException as error:
            logger.error(
                f'Failed to collect task results: {task}. Reason: {error}')
def _kill(self):
    """
    Forcefully kill the process.

    Nothing is done if the process has been stopped already or if its
    status is `Status.idle`.

    Raises:
        SrtUtilsException: if the process has not been started yet, or if
            it is still reported as running one second after the kill
            request.
    """
    logger.debug(f'Killing process: {self.id}')

    if not self.is_started:
        raise SrtUtilsException(
            'Process has not been started yet. Kill can not be done'
        )

    if self.is_stopped:
        return

    state_before, _ = self.status
    if state_before != Status.idle:
        self.process.kill()
        # Give the process a moment to exit before checking on it again
        time.sleep(1)

        state_after, _ = self.status
        if state_after == Status.running:
            raise SrtUtilsException(f'Process has not been killed: {self.id}')
def before_collect_results_checks(
    obj: IObject,
    process: Process,
    collect_results_path: pathlib.Path
):
    """
    Helper function which performs preliminary checks for `LocalRunner`
    and `RemoteRunner` classes before collecting object results.

    Attributes:
        obj: Object the results are collected for.
        process: Process the object has been run with.
        collect_results_path: Directory the results are collected into.

    Returns:
        None in every case. A missing `obj.filepath` is only logged, it is
        not reported to the caller.

    Raises:
        SrtUtilsException: if the process has not been started, has not
            been stopped, or if `collect_results_path` does not exist.
    """
    if not process.is_started:
        raise SrtUtilsException(
            f'Process has not been started yet: {obj}. '
            'Can not collect results'
        )

    if not process.is_stopped:
        raise SrtUtilsException(
            f'Process has not been stopped yet: {obj}, {process}. '
            'Can not collect results'
        )

    # It's expected that at this moment directory
    # self.collect_results_path already exists, because it is created
    # in SingleExperimentRunner class
    if not collect_results_path.exists():
        raise SrtUtilsException(
            'There was no directory for collecting results created: '
            f'{collect_results_path}. Can not collect results'
        )

    # If an object has filepath equal to None, it means there should be
    # no output file produced. Identity comparison is used on purpose:
    # `== None` would go through a custom `__eq__` if the path type had one
    if obj.filepath is None:
        logger.info('There was no output file expected, nothing to collect')
def _terminate(self):
    """
    Ask the process to terminate by sending it an interrupt signal.

    Nothing is done if the process has been stopped already or if its
    status is `Status.idle`. After the signal is sent, the status is
    polled once per second, three times at most.

    Raises:
        SrtUtilsException: if the process has not been started yet, or if
            it has not become idle after the last poll.
    """
    logger.debug(f'Terminating process: {self.id}')

    if not self.is_started:
        raise SrtUtilsException(
            'Process has not been started yet. Terminate can not be done'
        )

    if self.is_stopped:
        return

    current, _ = self.status
    if current == Status.idle:
        return

    logger.debug('Sending SIGINT/CTRL_C_EVENT signal')
    # signal.CTRL_C_EVENT is looked up on Windows only, signal.SIGINT
    # everywhere else
    if sys.platform == 'win32':
        interrupt = signal.CTRL_C_EVENT
    else:
        interrupt = signal.SIGINT
    self.process.send_signal(interrupt)

    polls_left = 3
    while polls_left > 0:
        time.sleep(1)
        current, _ = self.status
        if current == Status.idle:
            return
        polls_left -= 1

    raise SrtUtilsException(f'Process has not been terminated: {self.id}')
def stop(self):
    """
    Stop process.

    The process is first asked to terminate. Only if that fails, it is
    killed. Nothing is done if the process has been stopped already.

    Raises:
        SrtUtilsException: if the process has not been started yet, or if
            neither terminating nor killing it has succeeded.
    """
    logger.debug(f'Stopping process: {self.id}')

    if not self.is_started:
        raise SrtUtilsException(
            'Process has not been started yet. Stop can not be done'
        )

    if self.is_stopped:
        return

    # NOTE: There is a problem with terminating processes which use SSH
    # to run a command on a remote server. The problem is in SSH not
    # forwarding a signal (e.g., SIGINT, SIGTERM). As a result, SSH session
    # itself terminates and process.poll() returns None, however
    # an application started from a command continues to work on a remote
    # server. The solution is to use -t option in order to allocate
    # a pseudo-terminal. See
    # https://stackoverflow.com/questions/48419781/work-around-ssh-does-not-forward-signal
    # for details.
    # FIXME: Maybe it is reasonable to add additional check in
    # clean-up actions that the process is not running on a remote server
    # ps -A | grep [process_name]
    # FIXME: However, there is a problem with wrong interpretation of
    # carriage (\r\n) from pseudo-terminal in this case. Check stdout,
    # it is full of b'\r\n'.
    # FIXME: Signals may not work on Windows properly. Might be useful
    # https://stefan.sofa-rockers.org/2013/08/15/handling-sub-process-hierarchies-python-linux-os-x/
    try:
        self._terminate()
    except SrtUtilsException as terminate_error:
        logger.error(
            f'Failed to terminate process: {self.id}. '
            f'Reason: {terminate_error}'
        )

        # TODO: (For future) Experiment with this more. If stransmit will
        # not stop after several terminations, there is a problem, and
        # kill() will hide this problem in this case.
        # TODO: (!) There is a problem with tsp, it's actually not killed
        # however process_is_running(process) becomes False
        try:
            self._kill()
        except SrtUtilsException as kill_error:
            logger.error(
                f'Failed to kill process: {self.id}. Reason: {kill_error}'
            )
            # Chain the kill failure so that the root cause stays visible
            # in the traceback
            raise SrtUtilsException(
                f'Process has not been stopped: {self.id}'
            ) from kill_error

    self.is_stopped = True
def _create_directory(
    dirpath: str,
    username: str,
    host: str
):
    """
    Create directory on a remote machine via SSH for saving object results
    before starting the object.

    Attributes:
        dirpath:
            Directory path on the remote machine.
        username:
            Username on the remote machine to connect through.
        host:
            IP address of the remote machine to connect.

    Raises:
        SrtUtilsException: if the SSH connection fails or times out, or if
            `mkdir` exits with a nonzero status on the remote machine.
    """
    logger.info(
        '[RemoteRunner] Creating a directory for saving object results '
        f'remotely via SSH. Username: {username}, host: {host}, '
        f'dirpath: {dirpath}'
    )

    try:
        # FIXME: By default Paramiko will attempt to connect to a running
        # SSH agent (Unix style, e.g. a live SSH_AUTH_SOCK, or Pageant if
        # one is on Windows). That's why prompt for login-password is not
        # disabled under condition that password is not configured via
        # connect_kwargs.password
        with fabric.Connection(host=host, user=username) as c:
            # warn=True makes run() return the result for a nonzero exit
            # status instead of raising invoke's UnexpectedExit, so that
            # the failure is reported as SrtUtilsException by the check
            # on result.exited below
            result = c.run(f'mkdir -p {dirpath}', warn=True)
    except paramiko.ssh_exception.SSHException as error:
        raise SrtUtilsException(
            f'Directory has not been created: {dirpath}. Exception '
            f'occured ({error.__class__.__name__}): {error}. Check that '
            'ssh-agent has been started before running the script'
        ) from error
    except TimeoutError as error:
        raise SrtUtilsException(
            f'Directory has not been created: {dirpath}. Exception '
            f'occured ({error.__class__.__name__}): {error}. Check that '
            'IP address of the remote machine is correct and the '
            'machine is not down'
        ) from error

    if result.exited != 0:
        raise SrtUtilsException(f'Directory has not been created: {dirpath}')
def start(self):
    """
    Start single experiment.

    The directory for collecting results is created first, then the tasks
    are started one by one in the order they are stored in `self.tasks`.
    The experiment is marked as started only after all the tasks have
    been started.

    Raises:
        SrtUtilsException: if the experiment has been started already.
            Exceptions raised while creating the directory or starting
            a task are not caught here.
    """
    logger.info('Starting single experiment')

    if self.is_started:
        raise SrtUtilsException(
            'Experiment has been started already. Start can not be done')

    self._create_directory(self.collect_results_path)

    for task in self.tasks:
        # Log through the module-level logger (not the root `logging`
        # module) so that this record is handled like all the others
        logger.info(f'Starting task: {task}')
        task.obj_runner.start()

        sleep_after_start = task.sleep_after_start
        if sleep_after_start is not None:
            logger.info(f'Sleeping {sleep_after_start}s after task start')
            time.sleep(sleep_after_start)

    self.is_started = True
def stop(self):
    """
    Stop single experiment.

    The tasks are stopped in the reversed order of `self.tasks`. A failure
    to stop one task is logged and counted, the remaining tasks are still
    stopped. The experiment is marked as stopped only if every task has
    been stopped.

    Raises:
        SrtUtilsException: if the experiment has not been started yet, or
            if at least one task has not been stopped.
    """
    logger.info('Stopping single experiment')
    not_stopped_tasks = 0

    if not self.is_started:
        raise SrtUtilsException(
            'Experiment has not been started yet. Stop can not be done')

    if self.is_stopped:
        logger.info('Experiment has been stopped already. Nothing to do')
        return

    logger.info('Stopping tasks in reversed order')

    # By default, stop the tasks in reverse order
    # TODO: Implement stopping tasks according to the specified stop order.
    # if self.ignore_stop_order:
    for task in reversed(self.tasks):
        # Log through the module-level logger (not the root `logging`
        # module) so that this record is handled like all the others
        logger.info(f'Stopping task: {task}')

        # This try/except block is needed here in order to stop as many
        # tasks as we can in case something has failed
        try:
            task.obj_runner.stop()
        except SrtUtilsException as error:
            logger.error(f'Failed to stop task: {task}. Reason: {error}')
            not_stopped_tasks += 1
        finally:
            # The pause is applied no matter whether the stop has
            # succeeded or not
            sleep_after_stop = task.sleep_after_stop
            if sleep_after_stop is not None:
                logger.info(
                    f'Sleeping {sleep_after_stop}s after task stop')
                time.sleep(sleep_after_stop)

    if not_stopped_tasks != 0:
        raise SrtUtilsException('Not all the tasks have been stopped')

    self.is_stopped = True
def before_collect_results_checks(
    obj: IObject,
    process: Process,
    collect_results_path: pathlib.Path
):
    """
    Validate that object results can be collected.

    Shared by `LocalRunner` and `RemoteRunner` classes. The checks are
    done in this order: the process has been started, the process has been
    stopped, the directory for collecting results exists.

    Raises:
        SrtUtilsException: as soon as one of the checks fails.
    """
    if not process.is_started:
        reason = (
            f'Process has not been started yet: {obj}. '
            'Can not collect results'
        )
        raise SrtUtilsException(reason)

    if not process.is_stopped:
        reason = (
            f'Process has not been stopped yet: {obj}, {process}. '
            'Can not collect results'
        )
        raise SrtUtilsException(reason)

    # The directory is expected to exist at this point, because it is
    # created in SingleExperimentRunner class
    if collect_results_path.exists():
        return

    reason = (
        'There was no directory for collecting results created: '
        f'{collect_results_path}. Can not collect results'
    )
    raise SrtUtilsException(reason)
def collect_results(self):
    """
    Read everything available from the process stdout and stderr.

    Returns:
        A `(stdout, stderr)` tuple with the lines read from each stream.

    Raises:
        SrtUtilsException: if the process has not been started yet.
    """
    if self.is_started:
        streams = self.process
        # stdout is read before stderr
        out_lines = streams.stdout.readlines()
        err_lines = streams.stderr.readlines()
        return out_lines, err_lines

    raise SrtUtilsException(
        'Process has not been started yet. Can not collect results'
    )
def _create_directory(dirpath: pathlib.Path):
    """
    Create a fresh local directory the experiment results are saved into.

    Raises:
        SrtUtilsException: if `create_local_directory` reports that the
            directory has not been created, which is treated as
            "the directory exists already".
    """
    logger.info(
        '[SingleExperimentRunner] Creating a local directory for saving '
        f'experiment results: {dirpath}')

    if create_local_directory(dirpath):
        return

    # An existing directory is never reused, so that its contents are
    # left untouched
    raise SrtUtilsException(
        'Directory for saving experiment results already exists: '
        f'{dirpath}. Please use non-existing directory name and '
        'start the experiment again. Existing directory contents '
        'will not be deleted')
def create_local_directory(dirpath: pathlib.Path):
    """
    Helper function used to create the directory locally.

    Missing parent directories are created as well.

    Attributes:
        dirpath:
            `pathlib.Path` directory path.

    Returns:
        True if the directory has been created, False if `dirpath` exists
        already (in this case nothing is created or modified).

    Raises:
        SrtUtilsException: if creating the directory fails.
    """
    if dirpath.exists():
        return False

    # TODO: Debug and improve this in order to catch particular exceptions
    try:
        dirpath.mkdir(parents=True)
    except Exception as error:
        # Chain the original exception so that the root cause stays
        # visible in the traceback
        raise SrtUtilsException(
            f'Directory has not been created: {dirpath}. Exception '
            f'occured ({error.__class__.__name__}): {error}') from error

    return True
def clean_up(self):
    """
    Perform cleaning up in case of something has gone wrong during
    the experiment.

    Every task whose runner reports `Status.running` is stopped. If
    stopping fails, it is retried once. The experiment is marked as
    stopped only if no task has failed to stop on both tries.

    Raises:
        SrtUtilsException: if at least one task has not been stopped.
    """
    logger.info('Cleaning up after experiment')
    not_stopped_tasks = 0

    for task in self.tasks:
        if task.obj_runner.status != Status.running:
            continue

        # Log through the module-level logger (not the root `logging`
        # module) so that this record is handled like all the others
        logger.info(f'Stopping task: {task}')
        try:
            task.obj_runner.stop()
        except SrtUtilsException as first_error:
            logger.error(
                f'Failed to stop task: {task}, retrying to stop '
                f'again. Reason: {first_error}')

            try:
                task.obj_runner.stop()
            except SrtUtilsException as second_error:
                logger.error(
                    f'Failed to stop task on the second try: {task}. '
                    f'Reason: {second_error}')
                not_stopped_tasks += 1

    if not_stopped_tasks != 0:
        raise SrtUtilsException(
            'Not all the tasks have been stopped during cleaning up')

    self.is_stopped = True
def collect_results(self):
    """
    Before collecting object results, this function creates a local
    directory `username@host` inside self.collect_results_path directory
    where the results produced by the object are copied.

    Nothing is collected if `self.obj.filepath` is None.

    Raises:
        SrtUtilsException: if the preliminary checks fail, if the output
            file does not exist on the remote machine, if the destination
            file exists already, or if downloading the file fails.
    """
    logger.info(f'Collecting object results: {self.obj}, {self.process}')

    before_collect_results_checks(self.obj, self.process,
                                  self.collect_results_path)

    # If an object has filepath equal to None, it means there should be
    # no output file produced
    if self.obj.filepath is None:
        logger.info(
            'There was no output file expected, nothing to collect')
        return

    # If an object has filepath defined, it means there should be
    # an output file produced. However it does not mean that the file
    # was created successfully, that's why we check whether the filepath
    # exists.
    with fabric.Connection(host=self.host, user=self.username) as c:
        if not exists(c, self.obj.filepath):
            stdout, stderr = self.process.collect_results()
            raise SrtUtilsException(
                'There was no output file produced by the object: '
                f'{self.obj}, nothing to collect. Process stdout: '
                f'{stdout}. Process stderr: {stderr}')

    # Create 'username@host' folder to copy produced by the object file
    # (inside self.collect_results_path directory)
    destination_dir = (
        self.collect_results_path / f'{self.username}@{self.host}')
    logger.info('Creating a local directory for copying object results: '
                f'{destination_dir}')
    # The returned flag is ignored on purpose: the directory may exist
    # already, which is fine here
    create_local_directory(destination_dir)

    logger.info(f'Copying object results into: {destination_dir}')
    source = self.obj.filepath
    destination = destination_dir / source.name

    if destination.exists():
        raise SrtUtilsException(
            'The destination file already exists, there might be a '
            f'file created by the other object: {destination}. File '
            f'with object results was not copied: {self.obj.filepath}')

    # TODO: Implement copying files using rsync
    try:
        # http://docs.fabfile.org/en/2.3/api/transfer.html
        with fabric.Connection(host=self.host, user=self.username) as c:
            c.get(source, destination)
    except OSError as error:
        raise SrtUtilsException(
            f'Object results have not been collected: {self.obj.filepath}'
            f'. Exception occured ({error.__class__.__name__}): {error}. '
        ) from error
    except Exception as error:
        logger.info('Most probably paramiko exception')
        raise SrtUtilsException(
            f'Object results have not been collected: {self.obj.filepath}'
            f'. Exception occured ({error.__class__.__name__}): {error}. '
        ) from error
def collect_results(self):
    """
    Before collecting object results, this function creates a local
    directory `local` inside self.collect_results_path directory where
    the results produced by the object are copied.

    Nothing is collected if `self.obj.filepath` is None. The source file
    is copied, not moved.

    Raises:
        SrtUtilsException: if the preliminary checks fail, if the output
            file does not exist, or if the destination file exists
            already.
    """
    # Imported locally to keep this function self-contained
    import shutil

    logger.info(f'Collecting object results: {self.obj}, {self.process}')

    before_collect_results_checks(self.obj, self.process,
                                  self.collect_results_path)

    # If an object has filepath equal to None, it means there should be
    # no output file produced
    if self.obj.filepath is None:
        logger.info(
            'There was no output file expected, nothing to collect')
        return

    # If an object has filepath defined, it means there should be
    # an output file produced. However it does not mean that the file
    # was created successfully, that's why we check whether the filepath
    # exists.
    if not self.obj.filepath.exists():
        stdout, stderr = self.process.collect_results()
        raise SrtUtilsException(
            'There was no output file produced by the object: '
            f'{self.obj}, nothing to collect. Process stdout: '
            f'{stdout}. Process stderr: {stderr}')

    # Create 'local' folder to copy produced by the object file
    # (inside self.collect_results_path directory)
    destination_dir = self.collect_results_path / 'local'
    logger.info('Creating a local directory for copying object results: '
                f'{destination_dir}')
    # The returned flag is ignored on purpose: the directory may exist
    # already, which is fine here
    create_local_directory(destination_dir)

    # In case we have several tasks which are run locally by LocalRunner
    # runner and in case the tasks have the same names for the output
    # files, the result might be overwritten. That's why the destination
    # is opened in exclusive mode ('x'), which raises FileExistsError
    # instead of overwriting an existing file. It is necessary to make
    # sure that the file names for different tasks are unique.
    logger.info(f'Copying object results into: {destination_dir}')
    source = self.obj.filepath
    destination = destination_dir / source.name

    try:
        # The source is opened first, so that an unreadable source does
        # not leave an empty destination file behind. The contents are
        # streamed rather than loaded into memory as a whole.
        with source.open(mode='rb') as src, \
                destination.open(mode='xb') as dst:
            shutil.copyfileobj(src, dst)
    except FileExistsError as error:
        raise SrtUtilsException(
            'The destination file already exists, there might be a '
            f'file created by the other object: {destination}. File '
            f'with object results was not copied: {self.obj.filepath}'
        ) from error
def start(self):
    """
    Start process.

    The process is started with stdin, stdout and stderr connected to
    pipes. After a pause the status is checked in order to detect
    a process which has terminated right after the start.

    Raises:
        SrtUtilsException: if the process has been started already, if
            it can not be launched, or if its status is `Status.idle`
            after the pause.
    """
    logger.debug('Starting process')

    if self.is_started:
        raise SrtUtilsException(
            f'Process has been started already: {self.id}. '
            'Start can not be done'
        )

    # The pipes are binary. `bufsize=1` (line buffering) is only usable
    # in text mode; for binary pipes the default buffer size is used
    # anyway and Python 3.8+ emits a RuntimeWarning, so it is not passed.
    popen_kwargs = {
        'stdin': subprocess.PIPE,
        'stdout': subprocess.PIPE,
        'stderr': subprocess.PIPE,
    }
    if sys.platform == 'win32':
        # NOTE(review): presumably needed so that console control events
        # can be sent to the child process on Windows - confirm
        popen_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP

    try:
        self.process = subprocess.Popen(self.args, **popen_kwargs)
    except OSError as error:
        raise SrtUtilsException(
            f'Process has not been started: {self.args}. {error}'
        ) from error

    self.is_started = True

    # TODO: Adjust timers
    # Check that the process has started successfully and has not
    # terminated because of an error
    if self.via_ssh:
        time.sleep(SSH_CONNECTION_TIMEOUT + 1)
    else:
        # FIXME: Find a better solution, I changed the time from 1 to 5 s,
        # cause it was not enough in case of errors with srt-test-messaging
        # app, e.g. when starting the caller first and there is no listener
        # yet
        # NOTE: A good thing to consider - what would be in case the child
        # process finishes its work earlier than the time specified (5s).
        # It is important to consider especially in case of fsrt and small
        # files transmission.
        time.sleep(5)

    status, returncode = self.status
    if status == Status.idle:
        raise SrtUtilsException(
            f'Process has not been started: {self.args}, returncode: '
            f'{returncode}, stdout: {self.process.stdout.readlines()}, '
            f'stderr: {self.process.stderr.readlines()}'
        )

    self.id = self.process.pid