def troubleshoot_server(self):
    """
    Troubleshoot a job that has a problematic server status and resubmit it.

    Nothing is done unless ``self.settings['ssh']`` is truthy.

    * OGE: the present run is deleted, the list of available nodes is queried,
      the first node reporting ``0/0/8`` which is not already recorded in
      ``self.server_nodes`` is selected (and appended to that list), the remote
      submit file is rewritten to request that node, and the job is re-run.
      If no such node is found, an error is logged and the method returns
      without resubmitting.
    * Slurm: the present run is deleted and the job is re-run as is.

    Any other cluster software is left untouched.
    """
    if not self.settings['ssh']:
        return

    # Keep the raw value for dictionary look-ups; compare case-insensitively.
    cluster_soft = servers[self.server]['cluster_soft']

    if cluster_soft.lower() == 'oge':
        # delete present server run
        logging.error(
            'Job {name} has server status {stat} on {server}. Troubleshooting by changing node.'
            .format(name=self.job_name, stat=self.job_status[0], server=self.server))
        ssh = SSH_Client(self.server)
        ssh.send_command_to_server(
            command=delete_command[cluster_soft] + ' ' + str(self.job_id))

        # find available nodes
        stdout, _ = ssh.send_command_to_server(
            command=list_available_nodes_command[cluster_soft])
        for line in stdout:
            try:
                node = line.split()[0].split('.')[0].split('node')[1]
            except IndexError:
                # Blank line, or a line that does not name a node: skip it
                # instead of aborting the whole troubleshooting attempt.
                continue
            # The cluster software was already established (case-insensitively)
            # to be OGE above, so only the node criteria are checked here.
            if '0/0/8' in line and node not in self.server_nodes:
                self.server_nodes.append(node)
                break
        else:
            logging.error('Could not find an available node on the server')
            # TODO: continue troubleshooting; if all else fails, put job to sleep for x min and try again searching for a node
            return

        # modify submit file
        content = ssh.read_remote_file(
            remote_path=self.remote_path,
            filename=submit_filename[cluster_soft])
        node_directive = '#$ -l h=node{0}.cluster'.format(node)
        # The lines are merged below with ''.join, so each entry has to carry its
        # own line terminator. Only add one if the file's lines actually have
        # them (presumably they do - verify against SSH_Client.read_remote_file).
        newline = '\n' if any(entry.endswith('\n') for entry in content) else ''
        for i, line in enumerate(content):
            if '#$ -l h=node' in line:
                content[i] = node_directive + ('\n' if line.endswith('\n') else '')
                break
        else:
            content.insert(7, node_directive + newline)
        # convert list into a single string, not to upset paramiko
        content = ''.join(content)

        # resubmit
        ssh.upload_file(
            remote_file_path=os.path.join(self.remote_path, submit_filename[cluster_soft]),
            file_string=content)
        self.run()

    elif cluster_soft.lower() == 'slurm':
        # TODO: change node on Slurm
        # delete present server run
        logging.error(
            'Job {name} has server status {stat} on {server}. Re-running job.'
            .format(name=self.job_name, stat=self.job_status[0], server=self.server))
        ssh = SSH_Client(self.server)
        ssh.send_command_to_server(
            command=delete_command[cluster_soft] + ' ' + str(self.job_id))
        # resubmit
        self.run()
def _upload_submit_file(self):
    """
    Upload the submit script (``self.submit``) to the job's remote folder.

    The remote folder is created first (``mkdir -p``). The file name is taken
    from ``submit_filename`` according to the server's cluster software.
    """
    client = SSH_Client(self.server)
    make_dir_command = 'mkdir -p {0}'.format(self.remote_path)
    client.send_command_to_server(command=make_dir_command)

    cluster_soft = servers[self.server]['cluster_soft']
    destination = os.path.join(self.remote_path, submit_filename[cluster_soft])
    client.upload_file(remote_file_path=destination, file_string=self.submit)
def _upload_input_file(self):
    """
    Upload the input file (``self.input``) to the job's remote folder.

    The remote folder is created first (``mkdir -p``). The file name is taken
    from ``input_filename`` according to ``self.software``. After the upload,
    ``self.initial_time`` is set to the last-modified time reported by the
    server for the uploaded file.
    """
    client = SSH_Client(self.server)
    make_dir_command = 'mkdir -p {0}'.format(self.remote_path)
    client.send_command_to_server(command=make_dir_command)

    destination = os.path.join(self.remote_path, input_filename[self.software])
    client.upload_file(remote_file_path=destination, file_string=self.input)
    self.initial_time = client.get_last_modified_time(remote_file_path=destination)
def _get_additional_job_info(self):
    """
    Download the additional information of stdout and stderr from the server

    * OGE / SGE: ``out.txt`` and ``err.txt`` are downloaded from the remote
      folder; the result is the text of ``out.txt``, a newline, and the text
      of ``err.txt``.
    * Slurm: the remote folder is listed and every entry whose name contains
      both ``slurm`` and ``.out`` is downloaded; the text of each one that
      exists locally is appended, followed by a newline.

    A failed download (``TypeError`` / ``IOError``) is logged as a warning and
    otherwise ignored; a file that is absent locally contributes no text.

    Returns:
        str: The collected text ('' for any other cluster software).
    """
    ssh = SSH_Client(self.server)

    def fetch_text(file_name):
        """Download `file_name` into the local folder; return its text, or None if it is absent locally."""
        remote_file_path = os.path.join(self.remote_path, file_name)
        local_file_path = os.path.join(self.local_path, file_name)
        try:
            ssh.download_file(remote_file_path=remote_file_path, local_file_path=local_file_path)
        except (TypeError, IOError) as e:
            logging.warning('Got the following error when trying to download {0} for {1}:'.format(
                file_name, self.job_name))
            # `e.message` does not exist on Python 3 exceptions (and is usually
            # empty for IOError on Python 2); str(e) works on both.
            logging.warning(str(e))
        if not os.path.isfile(local_file_path):
            return None
        with open(local_file_path, 'r') as f:
            return f.read()

    content = ''
    cluster_soft = servers[self.server]['cluster_soft'].lower()

    if cluster_soft in ['oge', 'sge']:
        out_text = fetch_text('out.txt')
        err_text = fetch_text('err.txt')
        content += out_text or ''
        content += '\n'
        content += err_text or ''

    elif cluster_soft == 'slurm':
        respond = ssh.send_command_to_server(command='ls -alF', remote_path=self.remote_path)
        stdout = respond[0]
        # NOTE(review): send_command_to_server appears to return stdout as a
        # sequence of lines (troubleshoot_server iterates over it that way), so
        # every entry must be scanned, not only the first. A single
        # multi-line string is handled as well.
        if hasattr(stdout, 'splitlines'):
            listing = stdout.splitlines()
        else:
            listing = [row for chunk in stdout for row in chunk.splitlines()]
        for row in listing:
            fields = row.split()
            if not fields:
                continue  # blank line in the listing
            file_name = fields[-1]
            if 'slurm' in file_name and '.out' in file_name:
                text = fetch_text(file_name)
                if text is not None:
                    content += text
                    content += '\n'

    return content