class DirectConnection(Connection):
    '''No connection: work directly on the local machine.

    Implements the Connection interface (put_files/get_files/syscall)
    with local shutil copies and subprocess calls.
    '''

    # charset used to decode stdout/stderr of local commands, usually utf-8
    charset = typed_property("charset", str)

    def __init__(self):
        super().__init__()
        self.charset = "utf-8"

    def put_files(self, files, remote_path, timeout=None):
        '''Copy files to remote_path (a local directory here).

        timeout is accepted for interface compatibility with the other
        Connection implementations but is unused for local copies.
        Returns True; shutil/OS errors propagate to the caller.
        '''
        for f in files:
            shutil.copy2(f, remote_path)
        return True

    def get_files(self, files, local_path=None, timeout=None):
        '''Copy files to local_path (default: current directory ".").

        timeout is accepted for interface compatibility but unused.
        Returns True; shutil/OS errors propagate to the caller.
        '''
        if not local_path:
            local_path = "."
        for f in files:
            shutil.copy2(f, local_path)
        return True

    def syscall(self, program, args, timeout=None):
        '''Run program with args locally.

        Returns a (stdout, stderr, returncode) tuple with the streams
        decoded using self.charset.  Raises subprocess.TimeoutExpired
        when timeout (seconds) is exceeded.
        '''
        # subprocess.run exists from Python 3.5.0 on, so use >=;
        # the previous `> (3, 5, 0)` wrongly excluded exactly 3.5.0.
        if sys.version_info >= (3, 5):
            proc = subprocess.run([program] + args,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  timeout=timeout)
            return (proc.stdout.decode(self.charset),
                    proc.stderr.decode(self.charset),
                    proc.returncode)
        else:
            # Python 3.3/3.4 fallback: check_output does not capture
            # stderr, so the second tuple element is always empty here
            try:
                output = subprocess.check_output([program] + args,
                                                 timeout=timeout)
                return (output.decode(self.charset), '', 0)
            except subprocess.CalledProcessError as cpe:
                return (cpe.output.decode(self.charset), '', cpe.returncode)
class SLURMQJob(QJob):
    '''Queue-job handle for the SLURM batch system.

    Wraps the job id of a submitted job so the generic QJob machinery
    can refer to the queued job later.
    '''

    # job id as reported by SLURM on submission, kept as a string
    jobid = typed_property("jobid", str)

    def __init__(self, jobid):
        super().__init__()
        self.jobid = jobid
class SSHConnection(Connection):
    '''Connection to a remote machine via ssh/scp.

    Besides the main options username, machine and port, the user can
    set special attributes: the ssh/scp commands to invoke and the
    option lists passed to them.
    '''

    # name of the user on the remote machine, None possible
    username = typed_property("username", str)
    # name or IP-address of the remote machine
    machine = typed_property("machine", str)
    # charset of stdout of the remote machine, usually utf-8
    remote_charset = typed_property("remote_charset", str)
    # port to connect on the remote machine, None possible
    port = typed_property("port", int)
    # command to use for ssh-connections, usually just 'ssh' from PATH
    ssh_command = typed_property("ssh_command", str)
    # command to use for scp-connections, usually just 'scp' from PATH
    scp_command = typed_property("scp_command", str)
    # additional options to add to ssh
    ssh_options = typed_property("ssh_options", list)
    # additional options to add to scp
    scp_options = typed_property("scp_options", list)

    def __init__(self, username=None, machine="localhost", port=None):
        super().__init__()
        self.username = username
        self.machine = machine
        self.remote_charset = "utf-8"
        self.port = port
        self.ssh_command = "ssh"
        self.scp_command = "scp"
        self.scp_options = ["-o", "ConnectTimeout=20", "-o", "Batchmode=yes",
                            "-o", "StrictHostKeyChecking=no", "-q", "-p"]
        self.ssh_options = ["-o", "ConnectTimeout=20", "-o", "Batchmode=yes",
                            "-o", "StrictHostKeyChecking=no"]

    def _build_scp_args(self):
        '''Base scp command line: command, options and (optional) port.'''
        args = [self.scp_command]
        args.extend(self.scp_options)
        if self.port is not None:
            # scp uses capital -P for the port
            args.extend(["-P", "{}".format(self.port)])
        return args

    def _build_ssh_args(self):
        '''Base ssh command line: command, options, port, user and host.'''
        args = [self.ssh_command]
        args.extend(self.ssh_options)
        if self.port is not None:
            args.extend(["-p", "{}".format(self.port)])
        if self.username is not None:
            args.extend(["-l", self.username])
        args.append(self.machine)
        return args

    def _user_prefix(self):
        '''Return "user@" for scp targets, or "" when no username is set.'''
        if self.username is not None:
            return self.username + '@'
        return ""

    def put_files(self, files, remote_path, timeout=None):
        '''Copy local files to remote_path on the remote machine via scp.

        Returns True on success.  Raises subprocess.CalledProcessError
        when scp fails and subprocess.TimeoutExpired on timeout.
        '''
        args = self._build_scp_args()
        args.extend(files)
        args.append("{user}{machine}:{path}".format(user=self._user_prefix(),
                                                    machine=self.machine,
                                                    path=remote_path))
        # subprocess.run exists from Python 3.5.0 on, so >= instead of >
        if sys.version_info >= (3, 5):
            proc = subprocess.run(args, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE, timeout=timeout)
            proc.check_returncode()
        else:
            subprocess.check_output(args, timeout=timeout)
        return True

    def get_files(self, files, local_path=None, timeout=None):
        '''Copy remote files to local_path (default ".") via scp.

        Returns True on success.  Raises subprocess.CalledProcessError
        when scp fails and subprocess.TimeoutExpired on timeout.
        '''
        args = self._build_scp_args()
        for file in files:
            args.append("{user}{machine}:{path}".format(
                user=self._user_prefix(), machine=self.machine, path=file))
        if local_path is None:
            local_path = "."
        args.append(local_path)
        if sys.version_info >= (3, 5):
            proc = subprocess.run(args, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE, timeout=timeout)
            proc.check_returncode()
        else:
            subprocess.check_output(args, timeout=timeout)
        return True

    def syscall(self, program, args, timeout=None):
        '''Run program with args on the remote machine via ssh.

        Returns (stdout, stderr, returncode) decoded with
        self.remote_charset.  The caller's args list is left unmodified
        (the previous version inserted program into it as a side effect).
        '''
        ssh_args = self._build_ssh_args()
        # quote every word so the remote shell sees the same argument list
        remote_command = " ".join(shlex.quote(a) for a in [program] + args)
        ssh_args.append(remote_command)
        if sys.version_info >= (3, 5):
            proc = subprocess.run(ssh_args, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE, timeout=timeout)
            return (proc.stdout.decode(self.remote_charset),
                    proc.stderr.decode(self.remote_charset),
                    proc.returncode)
        else:
            # fallback: stderr is not captured by check_output
            try:
                output = subprocess.check_output(ssh_args, timeout=timeout)
                return (output.decode(self.remote_charset), '', 0)
            except subprocess.CalledProcessError as cpe:
                return (cpe.output.decode(self.remote_charset), '',
                        cpe.returncode)
class SnapTask():
    '''A single model run requested through an uploaded zip file.

    Bundles the paths and identifiers needed to unpack the upload,
    generate a job script and submit it to the HPC queue.
    '''
    topdir = typed_property('topdir', str)
    backupdir = typed_property('backupdir', str)
    zipfile = typed_property('zipfile', str)
    model = typed_property('model', str)
    id = typed_property('id', str)
    scpdestination = typed_property('scpdestination', str)
    # scp options pre-joined into one string by the caller; declared here
    # for consistency with the other attributes
    scpoptions = typed_property('scpoptions', str)
    timestamp = typed_property('timestamp', datetime.datetime)
    rundir = typed_property('rundir', str)

    def __init__(self, topdir, backupdir, zip_file, model, ident,
                 scpdestination, scpoptions):
        self.topdir = topdir
        self.backupdir = backupdir
        self.zipfile = zip_file
        self.model = model
        self.id = ident
        self.scpdestination = scpdestination
        self.scpoptions = scpoptions
        # creation time; also used to build a unique run-directory name
        self.timestamp = datetime.datetime.now()

    def status_filename(self):
        '''Name of the per-run status file reported back to the server.'''
        return "{ident}_{model}_status".format(ident=self.id,
                                               model=self.model)

    def is_complete(self, reldir):
        '''Return True when the zip file below topdir/reldir is fully
        transferred, i.e. it opens as a zip archive and passes the CRC
        test of all members.
        '''
        infile = os.path.join(self.topdir, reldir, self.zipfile)
        try:
            with zipfile.ZipFile(infile, 'r') as zf:
                # testzip() returns None when no corrupt member was found
                if zf.testzip() is None:
                    return True
        except Exception:
            # missing or partially uploaded file: simply not complete yet
            # (narrowed from a bare except so KeyboardInterrupt still works)
            pass
        return False

    def handle(self, hpc):
        '''Handle the job on the hpc.  HPC directories must be writable
        locally.

        Return True if the job is submitted; any error is printed and
        reported as False, never raised.
        '''
        retval = False
        try:
            retval = self._handle(hpc)
        except Exception:
            # deliberate best-effort: log and report failure to the caller
            traceback.print_exc()
        return retval

    def _handle(self, hpc):
        '''Move the upload into a fresh run directory, unpack it, write
        the job script (optionally with backup-sync commands appended)
        and submit it to the queue.  Returns True on submission.
        '''
        top_rundir = os.path.join(self.topdir, SnapRemoteRunner.RUN_DIR)
        if not os.path.isdir(top_rundir):
            os.mkdir(top_rundir)
        self.rundir = os.path.join(
            top_rundir,
            "{dt}_{ident}".format(
                dt=self.timestamp.strftime('%Y-%m-%dT%H%M%S'),
                ident=self.id))
        os.mkdir(self.rundir)
        infile = os.path.join(self.topdir, SnapRemoteRunner.UPLOAD_DIR,
                              self.zipfile)
        workfile = os.path.join(self.rundir, self.zipfile)
        os.rename(infile, workfile)
        with zipfile.ZipFile(workfile, 'r') as zf:
            zf.extractall(path=self.rundir)
        # start a remote detached qsub job
        snapJob = SnapJobEC(self, hpc)
        jobscript = snapJob.job_script()
        jobfile = os.path.join(self.rundir, 'snap.job')
        with open(jobfile, 'w') as jh:
            jh.write(jobscript)
            if self.backupdir:
                back_rundir = os.path.join(self.backupdir,
                                           SnapRemoteRunner.RUN_DIR)
                jh.write('''
# create files in backup directory
mkdir {back_rundir}
rsync -av {rundir} {back_rundir}
'''.format(back_rundir=back_rundir, rundir=self.rundir))
        # push the job into the queue, no feedback
        qjob = hpc.submit_job(jobfile, args=[])
        if qjob is None:
            return False
        return True
class SnapRemoteRunner():
    '''Fetches uploaded model requests from a remote server, runs them on
    the HPC and pushes status files back.  A status file in the working
    directory serves as a (best-effort) single-instance lock.
    '''
    UPLOAD_DIR = 'upload'      # incoming zip files (remote and local)
    RUN_DIR = 'runs'           # per-task run directories
    REJECTED_DIR = 'rejected'  # files with unexpected names
    WORK_DIR = 'work'          # status files written for the remote host
    hpc = typed_property("hpc", HPC)
    ssh = typed_property("ssh", SSHConnection)
    directory = typed_property("directory", str)
    directory2 = typed_property("directory2", str)
    dryrun = typed_property("dryrun", bool)
    remote = typed_property("remote", str)
    remote_dir = typed_property("remote_dir", str)
    remote_user = typed_property("remote_user", str)
    statusfile = typed_property("statusfile", str)

    def __init__(self, directory, hpc, directory2, remote, remoteUser,
                 remoteDir, dryrun=False):
        '''Set up directories and the ssh connection, acquire the
        status-file lock and, if acquired, immediately process new files.

        directory/directory2 are primary/backup working directories; the
        first writable one is used.  Raises Exception when neither is
        writable.  In dryrun mode nothing is submitted or deleted.
        '''
        self.dryrun = dryrun
        self.hpc = HPC.by_name(hpc)
        self.remote = remote
        self.remote_user = remoteUser
        self.remote_dir = remoteDir
        self.ssh = SSHConnection(remoteUser, remote)
        # scp target in user@host:dir form, handed on to SnapTask
        self.scpdestination = "{remote}:{remoteDir}".format(
            remote=self.remote, remoteDir=self.remote_dir)
        if self.remote_user:
            self.scpdestination = self.remote_user + '@' + self.scpdestination
        # pick the first writable working directory; the other becomes
        # the (optional) backup directory or is disabled
        if dirIsWritable(directory):
            self.directory = directory
            if dirIsWritable(directory2):
                self.directory2 = directory2
            else:
                if (self.dryrun):
                    print("directory2: '{}' not writable and disabled".format(
                        directory2),
                          file=sys.stderr)
                self.directory2 = ""
        elif dirIsWritable(directory2):
            if (self.dryrun):
                print(
                    "directory: '{}' not writable and disabled, using '{}' as default "
                    .format(directory, directory2),
                    file=sys.stderr)
            self.directory = directory2
            self.directory2 = ""
        else:
            raise Exception("{dir1} and {dir2} not writable".format(
                dir1=directory, dir2=directory2))

        workdir = os.path.join(self.directory, self.WORK_DIR)
        if not os.path.isdir(workdir):
            os.mkdir(workdir)

        self.statusfile = os.path.join(self.directory,
                                       "snapRemoteRunner_working")
        # make sure only one instance is running, not failsafe (no flock
        # on lustre, eventually in different directories, but good enough)
        if (os.path.exists(self.statusfile)):
            file_modified = datetime.datetime.fromtimestamp(
                os.lstat(self.statusfile).st_mtime)
            if (self.dryrun):
                # dryrun: just show who holds the lock
                with open(self.statusfile, 'rt') as fh:
                    msg = fh.read()
                print("status-file exists at '{}' with:".format(
                    self.statusfile),
                      file=sys.stderr)
                print(msg, file=sys.stderr)
            else:
                if datetime.datetime.now(
                ) - file_modified > datetime.timedelta(hours=3):
                    # return statusfile if hanging for more than 3 hours
                    print("cleaning up {} after 3 hours".format(
                        self.statusfile),
                          file=sys.stderr)
                    _cleanupFileCallable(self.statusfile)()
            # another instance is (or was) active: do no work this round
            return
        else:
            if not self.dryrun:
                # create the lock and make sure it is removed at exit
                with open(self.statusfile, 'wt') as fh:
                    atexit.register(_cleanupFileCallable(self.statusfile))
                    fh.write("working pid: {} on node: {}\n".format(
                        os.getpid(),
                        os.uname().nodename))
            if DEBUG:
                print("working pid: {} on node: {}\n".format(
                    os.getpid(),
                    os.uname().nodename))
            self._check_and_unpack_new_files()

    def write_status(self, task, tag, msg=""):
        '''Write a status file to the remote host. All errors here are
        ignored (status reporting is best-effort; failures are only
        printed).'''
        try:
            return self._write_status(task, tag, msg)
        except:
            traceback.print_exc()

    def _write_status(self, task, tag, msg=""):
        '''Append a "code:timestamp::text" line to the task's status file
        and copy the file to the remote host (30 s timeout).

        Known tags map to the historic ARGOS status codes; any other tag
        is written verbatim with the optional msg.

        Old codes from perl:
            100: Getting ARGOS data from server
            200: Finished getting ARGOS-data from server
            201: Finished running ${model}
            202: Finished extracting ${model} data for ARGOS
            401: ${run_ident}_${model}_input does not exist
            402: ${run_ident}_${model}_iso does not exist
            403: ${run_ident}_${model}_src does not exist
            404: Inconsistent isotope identification (isotop-navn)
            408: Initial time not covered by NWP database
            409: ${model} output data do not exist
            message = "$status_number:$timestamp:$text"
        '''
        filename = task.status_filename()
        work_file = os.path.join(self.directory, self.WORK_DIR, filename)
        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M')
        with open(work_file, 'a+') as fh:
            if (tag == 'downloading'):
                fh.write("{x}:{ts}::Getting ARGOS data from server\n".format(
                    x=100, ts=timestamp))
            elif (tag == 'success'):
                fh.write(
                    "{x}:{ts}::Finished extracting {model} data for ARGOS\n".
                    format(x=202, ts=timestamp, model=task.model))
            elif (tag == 'error'):
                fh.write("{x}:{ts}::{model} output data do not exist\n".format(
                    x=409, ts=timestamp, model=task.model))
            elif (tag == 'running'):
                fh.write("101:{ts}::running {model}\n".format(
                    ts=timestamp, model=task.model))
            elif (tag == 'internal'):
                fh.write(
                    "{x}:{ts}::internal error, cannot start job in queue in dir '{rundir}'\n"
                    .format(x=500, ts=timestamp, rundir=task.rundir))
            else:
                # unknown tag: write it through unchanged
                fh.write("{tag}:{ts} {msg}\n".format(ts=timestamp,
                                                     tag=tag,
                                                     msg=msg))
        self.ssh.put_files([work_file], self.remote_dir, 30)

    def _check_and_unpack_new_files(self):
        '''Download new files from the remote machine to the upload
        directory.

        - Move invalid files to rejected. (Wrong name, not containing
          *ARGOS2*.zip)
        - Unpack zip-files in project-folder / delete ignore incomplete
          files.
        - status for complete and incomplete files
        - Remove complete files from remote-upload and local upload
        - create modelruns

        throws an exception when download / unpack failed unexpectedly
        '''
        remote_files = os.path.join(self.remote_dir, self.UPLOAD_DIR, '*')
        local_upload = os.path.join(self.directory, self.UPLOAD_DIR)
        if not os.path.isdir(local_upload):
            os.mkdir(local_upload)
        local_rejected = os.path.join(self.directory, self.REJECTED_DIR)
        if not os.path.isdir(local_rejected):
            os.mkdir(local_rejected)
        try:
            self.ssh.get_files([remote_files], local_upload, 30)
        except subprocess.CalledProcessError as cpe:
            # code 1 is generic error, e.g. no files, 2 is connection error
            if cpe.returncode != 1:
                raise cpe

        delete_in_upload = []
        if DEBUG:
            print("checking files in uploaddir: {}".format(local_upload))
        for f in os.listdir(local_upload):
            if DEBUG:
                print("found file: {}".format(f))
            if os.path.isfile(os.path.join(local_upload, f)):
                # expected name: <ident>_ARGOS2<model>.zip
                m = re.match(r'([\w\-\.:]*)_ARGOS2(.*)\.zip', f)
                if m:
                    if DEBUG:
                        print("found zip-file: '{}'".format(f))
                    task = SnapTask(topdir=self.directory,
                                    backupdir=self.directory2,
                                    zip_file=f,
                                    ident=m.group(1),
                                    model=m.group(2),
                                    scpdestination=self.scpdestination,
                                    scpoptions=" ".join(self.ssh.scp_options))
                    if task.is_complete(reldir=self.UPLOAD_DIR):
                        if DEBUG:
                            print("handling zipfile: {}".format(f))
                        if not self.dryrun:
                            if task.handle(self.hpc):
                                self.write_status(task, tag='running')
                            else:
                                self.write_status(task, tag='internal')
                        # handled (or would be, in dryrun): schedule the
                        # upload copy for deletion
                        delete_in_upload.append(f)
                    else:
                        # still transferring: report and retry next round
                        self.write_status(task, tag='downloading')
                else:
                    # unexpected name: park locally, remove from remote
                    os.rename(os.path.join(local_upload, f),
                              os.path.join(local_rejected, f))
                    delete_in_upload.append(f)

        delete_upload_files = [
            os.path.join(self.UPLOAD_DIR, f) for f in delete_in_upload
        ]
        if DEBUG:
            print("deleting remotely: " + ", ".join(delete_upload_files))
        if not self.dryrun:
            # NOTE(review): when delete_upload_files is empty this runs a
            # bare remote 'rm' that fails; syscall ignores the returncode
            self.ssh.syscall('rm', delete_upload_files, 30)