Example #1
0
 def troubleshoot_server(self):
     """
     Troubleshoot a job whose server status indicates a problem by deleting and re-running it

     Nothing is done when ``self.settings['ssh']`` is falsy, or when the server's cluster
     software is neither OGE nor Slurm (compared case-insensitively).

     OGE: the job is deleted and the server is queried for available nodes. The first listed
     node whose line contains ``0/0/8`` and which is not yet in ``self.server_nodes`` is
     appended to that list. The remote submit file is then pinned to the node (an existing
     ``#$ -l h=node`` line is replaced, otherwise a new line is inserted at index 7),
     uploaded back, and the job is re-run via ``self.run()``. If no suitable node is found,
     an error is logged and the method returns without resubmitting.

     Slurm: the job is deleted and re-run as is.
     """
     if not self.settings['ssh']:
         return
     # Keep the original-case value: the command / filename tables are indexed with it.
     cluster_soft = servers[self.server]['cluster_soft']
     if cluster_soft.lower() == 'oge':
         logging.error(
             'Job {name} has server status {stat} on {server}. Troubleshooting by changing node.'
             .format(name=self.job_name,
                     stat=self.job_status[0],
                     server=self.server))
         ssh = SSH_Client(self.server)
         # delete present server run
         ssh.send_command_to_server(
             command=delete_command[cluster_soft] + ' ' + str(self.job_id))
         # find available nodes
         stdout, _ = ssh.send_command_to_server(
             command=list_available_nodes_command[cluster_soft])
         for line in stdout:
             # '0/0/8' presumably marks a completely idle 8-slot node - TODO confirm
             if '0/0/8' not in line:
                 continue
             # The first field is expected to look like '<queue>@node<N>.<domain>';
             # lines without a 'node' marker are skipped instead of raising IndexError.
             name_parts = line.split()[0].split('.')[0].split('node')
             if len(name_parts) < 2:
                 continue
             # No extra `== 'OGE'` test here: this branch is already OGE-only, and the
             # former case-sensitive comparison rejected every node for a lowercase config.
             if name_parts[1] not in self.server_nodes:
                 node = name_parts[1]
                 self.server_nodes.append(node)
                 break
         else:
             logging.error(
                 'Could not find an available node on the server')
             # TODO: continue troubleshooting; if all else fails, put job to sleep for x min and try again searching for a node
             return
         # modify submit file
         submit_name = submit_filename[cluster_soft]
         content = ssh.read_remote_file(remote_path=self.remote_path,
                                        filename=submit_name)
         node_directive = '#$ -l h=node{0}.cluster'.format(node)
         # The lines are joined with '' below, so each entry must carry its own line
         # terminator; otherwise the directive would fuse with the following line.
         for i, line in enumerate(content):
             if '#$ -l h=node' in line:
                 line_ending = line[len(line.rstrip('\r\n')):]
                 content[i] = node_directive + line_ending
                 break
         else:
             content.insert(7, node_directive + '\n')
         # convert list into a single string, as expected by `file_string`
         content = ''.join(content)
         # resubmit
         ssh.upload_file(remote_file_path=os.path.join(self.remote_path,
                                                       submit_name),
                         file_string=content)
         self.run()
     elif cluster_soft.lower() == 'slurm':
         # TODO: change node on Slurm
         logging.error(
             'Job {name} has server status {stat} on {server}. Re-running job.'
             .format(name=self.job_name,
                     stat=self.job_status[0],
                     server=self.server))
         ssh = SSH_Client(self.server)
         # delete present server run
         ssh.send_command_to_server(
             command=delete_command[cluster_soft] + ' ' + str(self.job_id))
         # resubmit
         self.run()
Example #2
0
 def _upload_submit_file(self):
     """
     Upload the submit script (``self.submit``) to this job's remote directory

     The remote directory ``self.remote_path`` is created first with ``mkdir -p``.
     The remote file name is looked up in ``submit_filename`` by the server's cluster software.
     """
     client = SSH_Client(self.server)
     # make sure the remote directory exists before uploading into it
     mkdir_command = 'mkdir -p {0}'.format(self.remote_path)
     client.send_command_to_server(command=mkdir_command)
     cluster_soft = servers[self.server]['cluster_soft']
     target = os.path.join(self.remote_path, submit_filename[cluster_soft])
     client.upload_file(remote_file_path=target, file_string=self.submit)
Example #3
0
 def _upload_input_file(self):
     """
     Upload the input file (``self.input``) to this job's remote directory

     The remote directory ``self.remote_path`` is created first with ``mkdir -p``.
     The remote file name is looked up in ``input_filename`` by ``self.software``.
     After the upload, ``self.initial_time`` is set to the last-modified time the
     server reports for the uploaded file.
     """
     client = SSH_Client(self.server)
     # make sure the remote directory exists before uploading into it
     mkdir_command = 'mkdir -p {0}'.format(self.remote_path)
     client.send_command_to_server(command=mkdir_command)
     target = os.path.join(self.remote_path, input_filename[self.software])
     client.upload_file(remote_file_path=target, file_string=self.input)
     # remember the upload time stamp as reported by the server
     self.initial_time = client.get_last_modified_time(remote_file_path=target)
Example #4
0
 def _get_additional_job_info(self):
     """
     Download the additional information of stdout and stderr from the server

     OGE / SGE: ``out.txt`` and ``err.txt`` are fetched from ``self.remote_path`` into
     ``self.local_path``; the result is the content of ``out.txt``, a newline, then the
     content of ``err.txt``.

     Slurm: the remote directory is listed with ``ls -alF`` and every entry whose name
     contains both ``slurm`` and ``.out`` is fetched; each file's content is appended,
     followed by a newline.

     A failed download (``TypeError`` / ``IOError``) is only logged as a warning; a local
     copy that exists afterwards is still read, and a missing one contributes no text.

     Returns:
         str: The concatenated file contents; an empty string for any other cluster software.
     """
     ssh = SSH_Client(self.server)

     def fetch_lines(filename):
         """Download `filename` from the job's remote directory and return its lines ([] if no local copy exists)."""
         remote_file_path = os.path.join(self.remote_path, filename)
         local_file_path = os.path.join(self.local_path, filename)
         try:
             ssh.download_file(remote_file_path=remote_file_path, local_file_path=local_file_path)
         except (TypeError, IOError) as e:
             logging.warning('Got the following error when trying to download {0} for {1}:'.format(
                 filename, self.job_name))
             # str(e) instead of e.message: the attribute does not exist on Python 3 and is
             # empty for errno-style IOErrors on Python 2.
             logging.warning(str(e))
         if os.path.isfile(local_file_path):
             with open(local_file_path, 'r') as f:
                 return f.readlines()
         # Always return a fresh list so content of a previously read file is never re-used.
         return list()

     content = ''
     cluster_soft = servers[self.server]['cluster_soft'].lower()
     if cluster_soft in ['oge', 'sge']:
         content += ''.join(fetch_lines('out.txt'))
         content += '\n'
         content += ''.join(fetch_lines('err.txt'))
     elif cluster_soft == 'slurm':
         respond = ssh.send_command_to_server(command='ls -alF', remote_path=self.remote_path)
         stdout = respond[0]
         # NOTE(review): stdout is presumably a list of output lines (it is iterated that way
         # in troubleshoot_server) - confirm. Every element is scanned, so both a list of
         # lines and a single multi-line string are handled.
         if not isinstance(stdout, (list, tuple)):
             stdout = [stdout]
         for chunk in stdout:
             for line in chunk.splitlines():
                 fields = line.split()
                 if not fields:
                     # blank listing line, nothing to parse
                     continue
                 # the last whitespace-separated field of an `ls -alF` line is taken as the name
                 filename = fields[-1]
                 if 'slurm' in filename and '.out' in filename:
                     content += ''.join(fetch_lines(filename))
                     content += '\n'
     return content