def run(self):
    """
    Write the submit script and the input file of this job and, when working over SSH,
    submit the job to the server.

    The job server status and the job server id returned by the submission are stored in
    ``self.job_status[0]`` and ``self.job_id``, respectively.
    """
    if self.fine:
        message = 'Running job {name} for {label} (fine opt)'.format(
            name=self.job_name, label=self.species_name)
    elif self.pivots:
        message = 'Running job {name} for {label} (pivots: {pivots})'.format(
            name=self.job_name, label=self.species_name, pivots=self.pivots)
    else:
        message = 'Running job {name} for {label}'.format(
            name=self.job_name, label=self.species_name)
    logging.info(message)

    logging.debug('writing submit script...')
    self.write_submit_script()
    logging.debug('writing input file...')
    self.write_input_file()

    if not self.ess_settings['ssh']:
        return

    ssh = SSH_Client(self.server)
    logging.debug('submitting job...')
    # submit_job returns job server status and job server id
    try:
        self.job_status[0], self.job_id = ssh.submit_job(remote_path=self.remote_path)
    except IndexError:
        # if the connection broke, the files might not have been uploaded correctly,
        # so write them again and give the submission one more attempt
        self.write_submit_script()
        self.write_input_file()
        self.job_status[0], self.job_id = ssh.submit_job(remote_path=self.remote_path)
def _check_job_server_status(self):
    """
    Ask the server for the status of this job (identified by ``self.job_id``).

    Possible statuses: `initializing`, `running`, `errored on node xx`, `done`

    Nothing is queried (and ``None`` is returned) when ``self.ess_settings['ssh']`` is falsy.
    """
    if not self.ess_settings['ssh']:
        return None
    return SSH_Client(self.server).check_job_status(self.job_id)
def delete(self):
    """
    Delete this job (identified by ``self.job_id``) from the server.

    The deletion is only attempted when working over SSH; otherwise the call just logs
    a debug message.
    """
    logging.debug('Deleting job {name} for {label}'.format(
        name=self.job_name, label=self.species_name))
    # The SSH flag is read from `ess_settings`, the same attribute `run`,
    # `_check_job_server_status` and `troubleshoot_server` consult
    # (this method previously read `self.settings`).
    if self.ess_settings['ssh']:
        ssh = SSH_Client(self.server)
        logging.debug('deleting job...')
        ssh.delete_job(self.job_id)
def _download_output_file(self):
    """
    Download the output file of this job from the server into ``self.local_path``
    (saved locally as `output.out`), record the remote file's last modified time in
    ``self.final_time``, and determine the job's run time.

    Raises:
        JobError: If the local output file does not exist after the download.
    """
    client = SSH_Client(self.server)
    source = os.path.join(self.remote_path, output_filename[self.software])
    destination = os.path.join(self.local_path, 'output.out')
    client.download_file(remote_file_path=source, local_file_path=destination)
    self.final_time = client.get_last_modified_time(remote_file_path=source)
    self.determine_run_time()
    if os.path.isfile(destination):
        return
    raise JobError('output file for {0} was not downloaded properly'.format(self.job_name))
def _get_additional_job_info(self):
    """
    Download the additional information of stdout and stderr from the server

    For OGE / SGE, `out.txt` and `err.txt` are fetched from the remote job folder.
    For Slurm, every file listed in the remote job folder whose name contains both
    'slurm' and '.out' is fetched. A failed download (TypeError / IOError) is logged
    as a warning and does not abort the method.

    Returns:
        str: The concatenated content of the files which exist locally after the download
             attempts (an empty string for any other cluster software).
    """

    def fetch(filename):
        """Try downloading `filename` from the remote job folder; return its local path."""
        remote_file_path = os.path.join(self.remote_path, filename)
        local_file_path = os.path.join(self.local_path, filename)
        try:
            ssh.download_file(remote_file_path=remote_file_path, local_file_path=local_file_path)
        except (TypeError, IOError) as e:
            logging.warning('Got the following error when trying to download {0} for {1}:'.format(
                filename, self.job_name))
            # `str(e)` rather than `e.message`: the `message` attribute does not exist on
            # Python 3 exceptions, and is empty for errno-style IOErrors on Python 2
            logging.warning(str(e))
        return local_file_path

    def read_if_present(path):
        """Return the content of the local file `path`, or '' if there is no such file."""
        if not os.path.isfile(path):
            return ''
        with open(path, 'r') as f:
            return f.read()

    content = ''
    ssh = SSH_Client(self.server)
    cluster_soft = servers[self.server]['cluster_soft'].lower()
    if cluster_soft in ['oge', 'sge']:
        # attempt both downloads first, then read whatever made it to the local folder
        local_paths = [fetch(filename) for filename in ('out.txt', 'err.txt')]
        content += '\n'.join([read_if_present(path) for path in local_paths])
    elif cluster_soft == 'slurm':
        respond = ssh.send_command_to_server(command='ls -alF', remote_path=self.remote_path)
        # Go over every entry of the returned stdout, not just the first one.
        # NOTE(review): stdout looks like a list of lines (troubleshoot_server iterates it
        # line by line), in which case its first entry is only the `total ...` header.
        file_names = list()
        for chunk in respond[0]:
            for line in chunk.splitlines():
                fields = line.split()
                if fields:  # skip blank lines
                    file_names.append(fields[-1])
        for file_name in file_names:
            if 'slurm' in file_name and '.out' in file_name:
                local_file_path = fetch(file_name)
                if os.path.isfile(local_file_path):
                    content += read_if_present(local_file_path)
                    content += '\n'
    return content
def _upload_input_file(self):
    """
    Upload the input file content (``self.input``) into the remote job folder, creating
    the folder if needed, and store the remote file's last modified time in
    ``self.initial_time``.
    """
    client = SSH_Client(self.server)
    make_folder = 'mkdir -p {0}'.format(self.remote_path)
    client.send_command_to_server(command=make_folder)
    destination = os.path.join(self.remote_path, input_filename[self.software])
    client.upload_file(remote_file_path=destination, file_string=self.input)
    self.initial_time = client.get_last_modified_time(remote_file_path=destination)
def _upload_submit_file(self):
    """
    Upload the submit script content (``self.submit``) into the remote job folder,
    creating the folder if needed. The remote file name is looked up in `submit_filename`
    by the cluster software of ``self.server``.
    """
    client = SSH_Client(self.server)
    make_folder = 'mkdir -p {0}'.format(self.remote_path)
    client.send_command_to_server(command=make_folder)
    script_name = submit_filename[servers[self.server]['cluster_soft']]
    destination = os.path.join(self.remote_path, script_name)
    client.upload_file(remote_file_path=destination, file_string=self.submit)
def troubleshoot_server(self):
    """
    Troubleshoot a job with a problematic server status (only when working over SSH).

    On OGE: delete the present server run, look for an available node which is not already
    listed in ``self.server_nodes``, re-write the remote submit script so it requests that
    node, and run the job again. If no such node is found, an error is logged and nothing
    is resubmitted.
    On Slurm: delete the present server run and run the job again.
    """
    if not self.ess_settings['ssh']:
        return
    cluster_soft = servers[self.server]['cluster_soft']
    if cluster_soft.lower() == 'oge':
        # delete present server run
        logging.error('Job {name} has server status "{stat}" on {server}. Troubleshooting by changing node.'.format(
            name=self.job_name, stat=self.job_status[0], server=self.server))
        ssh = SSH_Client(self.server)
        ssh.send_command_to_server(command=delete_command[cluster_soft] + ' ' + str(self.job_id))
        # find available nodes
        stdout, _ = ssh.send_command_to_server(command=list_available_nodes_command[cluster_soft])
        for line in stdout:
            # NOTE(review): '0/0/8' presumably marks a node with all 8 slots free - confirm
            if '0/0/8' not in line:
                continue
            host = line.split()[0].split('.')[0]
            if 'node' not in host:
                # not a node entry (e.g., a header line), nothing to parse here
                continue
            node = host.split('node')[1]
            if node not in self.server_nodes:
                self.server_nodes.append(node)
                break
        else:
            logging.error('Could not find an available node on the server')
            # TODO: continue troubleshooting; if all else fails, put job to sleep for x min and try again
            #  searching for a node
            return
        # modify submit file
        content = ssh.read_remote_file(remote_path=self.remote_path, filename=submit_filename[cluster_soft])
        node_directive = '#$ -l h=node{0}.cluster'.format(node)
        for i, line in enumerate(content):
            if '#$ -l h=node' in line:
                # keep the line ending of the replaced line; the lines are merged below with
                # ''.join(), so dropping it would fuse this directive with the following line
                line_ending = line[len(line.rstrip('\r\n')):]
                content[i] = node_directive + line_ending
                break
        else:
            content.insert(7, node_directive + '\n')
        content = ''.join(content)  # convert list into a single string, not to upset paramiko
        # resubmit
        ssh.upload_file(remote_file_path=os.path.join(self.remote_path, submit_filename[cluster_soft]),
                        file_string=content)
        # NOTE(review): run() calls write_submit_script() before submitting - confirm it does
        # not overwrite the node directive uploaded above
        self.run()
    elif cluster_soft.lower() == 'slurm':
        # TODO: change node on Slurm
        # delete present server run
        logging.error('Job {name} has server status "{stat}" on {server}. Re-running job.'.format(
            name=self.job_name, stat=self.job_status[0], server=self.server))
        ssh = SSH_Client(self.server)
        ssh.send_command_to_server(command=delete_command[cluster_soft] + ' ' + str(self.job_id))
        # resubmit
        self.run()