def __init__(self): ''' Constructor ''' connection = SSHConnection(username="******", machine="alvin.nsc.liu.se", port=22) queue = SLURMQueue() super().__init__(connection, queue)
def __init__(self): ''' Constructor ''' connection = SSHConnection(machine="xvis-m3b.met.no", port=22) queue = SGEQueue() super().__init__(connection, queue)
def __init__(self): ''' Constructor ''' connection = SSHConnection(username="******", machine="vilje.hpc.ntnu.no", port=22) queue = PBSQueue() super().__init__(connection, queue)
# Standard-library imports used by SnapRemoteRunner below. SSHConnection, HPC, SnapTask,
# typed_property, dirIsWritable, _cleanupFileCallable and DEBUG are expected to come from
# the surrounding module/package.
import atexit
import datetime
import os
import re
import subprocess
import sys
import traceback


class SnapRemoteRunner():
    UPLOAD_DIR = 'upload'
    RUN_DIR = 'runs'
    REJECTED_DIR = 'rejected'
    WORK_DIR = 'work'

    hpc = typed_property("hpc", HPC)
    ssh = typed_property("ssh", SSHConnection)
    directory = typed_property("directory", str)
    directory2 = typed_property("directory2", str)
    dryrun = typed_property("dryrun", bool)
    remote = typed_property("remote", str)
    remote_dir = typed_property("remote_dir", str)
    remote_user = typed_property("remote_user", str)
    statusfile = typed_property("statusfile", str)

    def __init__(self, directory, hpc, directory2, remote, remoteUser, remoteDir,
                 dryrun=False):
        self.dryrun = dryrun
        self.hpc = HPC.by_name(hpc)
        self.remote = remote
        self.remote_user = remoteUser
        self.remote_dir = remoteDir
        self.ssh = SSHConnection(remoteUser, remote)
        self.scpdestination = "{remote}:{remoteDir}".format(
            remote=self.remote, remoteDir=self.remote_dir)
        if self.remote_user:
            self.scpdestination = self.remote_user + '@' + self.scpdestination

        if dirIsWritable(directory):
            self.directory = directory
            if dirIsWritable(directory2):
                self.directory2 = directory2
            else:
                if self.dryrun:
                    print("directory2: '{}' not writable and disabled".format(directory2),
                          file=sys.stderr)
                self.directory2 = ""
        elif dirIsWritable(directory2):
            if self.dryrun:
                print("directory: '{}' not writable and disabled, using '{}' as default"
                      .format(directory, directory2),
                      file=sys.stderr)
            self.directory = directory2
            self.directory2 = ""
        else:
            raise Exception("{dir1} and {dir2} not writable".format(
                dir1=directory, dir2=directory2))

        workdir = os.path.join(self.directory, self.WORK_DIR)
        if not os.path.isdir(workdir):
            os.mkdir(workdir)

        self.statusfile = os.path.join(self.directory, "snapRemoteRunner_working")
        # make sure only one instance is running; not failsafe (no flock on lustre,
        # possibly running in different directories), but good enough
        if os.path.exists(self.statusfile):
            file_modified = datetime.datetime.fromtimestamp(
                os.lstat(self.statusfile).st_mtime)
            if self.dryrun:
                with open(self.statusfile, 'rt') as fh:
                    msg = fh.read()
                print("status-file exists at '{}' with:".format(self.statusfile),
                      file=sys.stderr)
                print(msg, file=sys.stderr)
            else:
                if datetime.datetime.now() - file_modified > datetime.timedelta(hours=3):
                    # remove the statusfile if it has been hanging for more than 3 hours
                    print("cleaning up {} after 3 hours".format(self.statusfile),
                          file=sys.stderr)
                    _cleanupFileCallable(self.statusfile)()
                return
        else:
            if not self.dryrun:
                with open(self.statusfile, 'wt') as fh:
                    atexit.register(_cleanupFileCallable(self.statusfile))
                    fh.write("working pid: {} on node: {}\n".format(
                        os.getpid(), os.uname().nodename))
            if DEBUG:
                print("working pid: {} on node: {}\n".format(
                    os.getpid(), os.uname().nodename))

        self._check_and_unpack_new_files()

    def write_status(self, task, tag, msg=""):
        '''Write a status file to the remote host. All errors here are ignored.'''
        try:
            return self._write_status(task, tag, msg)
        except Exception:
            traceback.print_exc()

    def _write_status(self, task, tag, msg=""):
        ''' Old codes from perl:
        if ($status_number == 100) {$text = ":Getting ARGOS data from server";}
        if ($status_number == 200) {$text = ":Finished getting ARGOS-data from server";}
        if ($status_number == 201) {$text = ":Finished running ${model}";}
        if ($status_number == 202) {$text = ":Finished extracting ${model} data for ARGOS";}
        if ($status_number == 401) {$text = ":$run_ident" . "_${model}_input does not exist";}
        if ($status_number == 402) {$text = ":$run_ident" . "_${model}_iso does not exist";}
        if ($status_number == 403) {$text = ":$run_ident" . "_${model}_src does not exist";}
        if ($status_number == 404) {$text = ":Inconsistent isotope identification (isotop-navn)";}
        if ($status_number == 408) {$text = ":Initial time not covered by NWP database";}
        if ($status_number == 409) {$text = ":${model} output data do not exist";}
        my $message = "$status_number" . ":" . "$timestamp" . ":" . "$text";
        '''
        filename = task.status_filename()
        work_file = os.path.join(self.directory, self.WORK_DIR, filename)
        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M')
        with open(work_file, 'a+') as fh:
            if tag == 'downloading':
                fh.write("{x}:{ts}::Getting ARGOS data from server\n".format(
                    x=100, ts=timestamp))
            elif tag == 'success':
                fh.write("{x}:{ts}::Finished extracting {model} data for ARGOS\n".format(
                    x=202, ts=timestamp, model=task.model))
            elif tag == 'error':
                fh.write("{x}:{ts}::{model} output data do not exist\n".format(
                    x=409, ts=timestamp, model=task.model))
            elif tag == 'running':
                fh.write("101:{ts}::running {model}\n".format(
                    ts=timestamp, model=task.model))
            elif tag == 'internal':
                fh.write("{x}:{ts}::internal error, cannot start job in queue in dir '{rundir}'\n"
                         .format(x=500, ts=timestamp, rundir=task.rundir))
            else:
                fh.write("{tag}:{ts} {msg}\n".format(ts=timestamp, tag=tag, msg=msg))
        self.ssh.put_files([work_file], self.remote_dir, 30)

    def _check_and_unpack_new_files(self):
        '''Download new files from the remote machine to the upload directory.

        - Move invalid files (wrong name, not matching *_ARGOS2*.zip) to the rejected directory.
        - Unpack zip-files in the project folder / ignore incomplete files.
        - Write status for complete and incomplete files.
        - Remove complete files from the remote upload and the local upload directory.
        - Create model runs.

        Raises an exception when download / unpack fails unexpectedly.
        '''
        remote_files = os.path.join(self.remote_dir, self.UPLOAD_DIR, '*')
        local_upload = os.path.join(self.directory, self.UPLOAD_DIR)
        if not os.path.isdir(local_upload):
            os.mkdir(local_upload)
        local_rejected = os.path.join(self.directory, self.REJECTED_DIR)
        if not os.path.isdir(local_rejected):
            os.mkdir(local_rejected)
        try:
            self.ssh.get_files([remote_files], local_upload, 30)
        except subprocess.CalledProcessError as cpe:
            # code 1 is generic error, e.g. no files; 2 is connection error
            if cpe.returncode != 1:
                raise cpe

        delete_in_upload = []
        if DEBUG:
            print("checking files in uploaddir: {}".format(local_upload))
        for f in os.listdir(local_upload):
            if DEBUG:
                print("found file: {}".format(f))
            if os.path.isfile(os.path.join(local_upload, f)):
                # expected file name: <ident>_ARGOS2<model>.zip
                m = re.match(r'([\w\-\.:]*)_ARGOS2(.*)\.zip', f)
                if m:
                    if DEBUG:
                        print("found zip-file: '{}'".format(f))
                    task = SnapTask(topdir=self.directory,
                                    backupdir=self.directory2,
                                    zip_file=f,
                                    ident=m.group(1),
                                    model=m.group(2),
                                    scpdestination=self.scpdestination,
                                    scpoptions=" ".join(self.ssh.scp_options))
                    if task.is_complete(reldir=self.UPLOAD_DIR):
                        if DEBUG:
                            print("handling zipfile: {}".format(f))
                        if not self.dryrun:
                            if task.handle(self.hpc):
                                self.write_status(task, tag='running')
                            else:
                                self.write_status(task, tag='internal')
                        delete_in_upload.append(f)
                    else:
                        self.write_status(task, tag='downloading')
                else:
                    os.rename(os.path.join(local_upload, f),
                              os.path.join(local_rejected, f))
                    delete_in_upload.append(f)

        delete_upload_files = [os.path.join(self.UPLOAD_DIR, f) for f in delete_in_upload]
        if DEBUG:
            print("deleting remotely: " + ", ".join(delete_upload_files))
        if not self.dryrun:
            self.ssh.syscall('rm', delete_upload_files, 30)