Esempio n. 1
0
 def __init__(self):
     '''
     Initialise the superclass for the machine alvin.nsc.liu.se.

     Creates an SSHConnection to alvin.nsc.liu.se on port 22 and a
     SLURMQueue, and passes both (connection first, queue second) to the
     superclass constructor.
     '''
     # NOTE(review): the username looks like a redacted placeholder - confirm
     ssh_link = SSHConnection(
         username="******",
         machine="alvin.nsc.liu.se",
         port=22,
     )
     scheduler = SLURMQueue()
     super().__init__(ssh_link, scheduler)
Esempio n. 2
0
 def __init__(self):
     '''
     Initialise the superclass for the machine xvis-m3b.met.no.

     Creates an SSHConnection to xvis-m3b.met.no on port 22 (no explicit
     username is given) and an SGEQueue, and passes both (connection first,
     queue second) to the superclass constructor.
     '''
     ssh_link = SSHConnection(
         machine="xvis-m3b.met.no",
         port=22,
     )
     scheduler = SGEQueue()
     super().__init__(ssh_link, scheduler)
Esempio n. 3
0
 def __init__(self):
     '''
     Initialise the superclass for the machine vilje.hpc.ntnu.no.

     Creates an SSHConnection to vilje.hpc.ntnu.no on port 22 and a
     PBSQueue, and passes both (connection first, queue second) to the
     superclass constructor.
     '''
     # NOTE(review): the username looks like a redacted placeholder - confirm
     ssh_link = SSHConnection(
         username="******",
         machine="vilje.hpc.ntnu.no",
         port=22,
     )
     scheduler = PBSQueue()
     super().__init__(ssh_link, scheduler)
Esempio n. 4
0
    def __init__(self,
                 directory,
                 hpc,
                 directory2,
                 remote,
                 remoteUser,
                 remoteDir,
                 dryrun=False):
        '''
        Configure the runner, claim the status file and process new uploads.

        directory  -- preferred local directory; used when dirIsWritable()
                      accepts it
        hpc        -- name that is resolved through HPC.by_name()
        directory2 -- second local directory; kept as self.directory2 when
                      both directories are writable, or promoted to
                      self.directory when it is the only writable one
        remote     -- remote host for the SSH connection and the scp target
        remoteUser -- user on the remote host; when truthy it is prepended
                      to the scp destination as "user@"
        remoteDir  -- directory on the remote host used in the scp target
        dryrun     -- when true, diagnostics are printed to stderr and the
                      status file is neither created nor cleaned up

        Raises Exception when neither directory nor directory2 is writable.

        When a status file already exists and dryrun is false, the
        constructor returns early without looking for new files.
        '''
        self.dryrun = dryrun
        self.hpc = HPC.by_name(hpc)
        self.remote = remote
        self.remote_user = remoteUser
        self.remote_dir = remoteDir
        self.ssh = SSHConnection(remoteUser, remote)

        # scp target has the form "[user@]host:dir"
        scp_target = "{remote}:{remoteDir}".format(remote=self.remote,
                                                   remoteDir=self.remote_dir)
        if self.remote_user:
            scp_target = self.remote_user + '@' + scp_target
        self.scpdestination = scp_target

        if not dirIsWritable(directory):
            # primary directory unusable: fall back to the second one or give up
            if not dirIsWritable(directory2):
                raise Exception("{dir1} and {dir2} not writable".format(
                    dir1=directory, dir2=directory2))
            if self.dryrun:
                print(
                    "directory: '{}' not writable and disabled, using '{}' as default "
                    .format(directory, directory2),
                    file=sys.stderr)
            self.directory = directory2
            self.directory2 = ""
        else:
            self.directory = directory
            if dirIsWritable(directory2):
                self.directory2 = directory2
            else:
                if self.dryrun:
                    print("directory2: '{}' not writable and disabled".format(
                        directory2),
                          file=sys.stderr)
                self.directory2 = ""

        work_path = os.path.join(self.directory, self.WORK_DIR)
        if not os.path.isdir(work_path):
            os.mkdir(work_path)

        self.statusfile = os.path.join(self.directory,
                                       "snapRemoteRunner_working")
        # The status file acts as a simple lock so that only one instance runs.
        # It is not failsafe (no flock on lustre, possibly different
        # directories), but good enough.
        if os.path.exists(self.statusfile):
            last_touched = datetime.datetime.fromtimestamp(
                os.lstat(self.statusfile).st_mtime)
            if not self.dryrun:
                age = datetime.datetime.now() - last_touched
                if age > datetime.timedelta(hours=3):
                    # release a status file that has been hanging for more
                    # than 3 hours; this instance still does not run
                    print("cleaning up {} after 3 hours".format(
                        self.statusfile),
                          file=sys.stderr)
                    _cleanupFileCallable(self.statusfile)()
                return
            # dryrun: only report what the existing status file contains
            with open(self.statusfile, 'rt') as fh:
                content = fh.read()
            print("status-file exists at '{}' with:".format(self.statusfile),
                  file=sys.stderr)
            print(content, file=sys.stderr)
        elif not self.dryrun:
            with open(self.statusfile, 'wt') as fh:
                # register the cleanup callable for interpreter exit
                atexit.register(_cleanupFileCallable(self.statusfile))
                note = "working pid: {} on node: {}\n".format(
                    os.getpid(),
                    os.uname().nodename)
                fh.write(note)
                if DEBUG:
                    print(note)

        self._check_and_unpack_new_files()
Esempio n. 5
0
class SnapRemoteRunner():
    '''Fetch uploaded ARGOS zip-files from a remote host and start runs for them.

    All work is triggered by the constructor: it selects a writable local
    directory, guards against parallel instances with a status file and then
    calls _check_and_unpack_new_files(). Status lines are appended to a local
    work file and copied to the remote directory via write_status().
    '''
    # Sub-directory names. UPLOAD_DIR is used below both the remote and the
    # local base directory, REJECTED_DIR and WORK_DIR below the local one.
    # RUN_DIR is not referenced by the code in this class.
    UPLOAD_DIR = 'upload'
    RUN_DIR = 'runs'
    REJECTED_DIR = 'rejected'
    WORK_DIR = 'work'

    # Attributes declared through typed_property (helper defined elsewhere;
    # presumably it enforces the given type on assignment - confirm).
    hpc = typed_property("hpc", HPC)
    ssh = typed_property("ssh", SSHConnection)
    directory = typed_property("directory", str)
    directory2 = typed_property("directory2", str)
    dryrun = typed_property("dryrun", bool)
    remote = typed_property("remote", str)
    remote_dir = typed_property("remote_dir", str)
    remote_user = typed_property("remote_user", str)
    statusfile = typed_property("statusfile", str)

    def __init__(self,
                 directory,
                 hpc,
                 directory2,
                 remote,
                 remoteUser,
                 remoteDir,
                 dryrun=False):
        '''Configure the runner, claim the status file and process new uploads.

        directory  -- preferred local directory; used when dirIsWritable()
                      accepts it
        hpc        -- name that is resolved through HPC.by_name()
        directory2 -- second local directory; kept as self.directory2 (later
                      passed to SnapTask as backupdir) when both directories
                      are writable, or promoted to self.directory when it is
                      the only writable one
        remote     -- remote host for the SSH connection and the scp target
        remoteUser -- user on the remote host; when truthy it is prepended
                      to the scp destination as "user@"
        remoteDir  -- directory on the remote host used in the scp target
        dryrun     -- when true, diagnostics are printed to stderr, the
                      status file is neither created nor cleaned up, no task
                      is handled and nothing is deleted remotely

        Raises Exception when neither directory nor directory2 is writable.

        When a status file already exists and dryrun is false, the
        constructor returns early without looking for new files.
        '''
        self.dryrun = dryrun
        self.hpc = HPC.by_name(hpc)
        self.remote = remote
        self.remote_user = remoteUser
        self.remote_dir = remoteDir
        self.ssh = SSHConnection(remoteUser, remote)

        # scp target has the form "[user@]host:dir"
        scp_target = "{remote}:{remoteDir}".format(remote=self.remote,
                                                   remoteDir=self.remote_dir)
        if self.remote_user:
            scp_target = self.remote_user + '@' + scp_target
        self.scpdestination = scp_target

        if not dirIsWritable(directory):
            # primary directory unusable: fall back to the second one or give up
            if not dirIsWritable(directory2):
                raise Exception("{dir1} and {dir2} not writable".format(
                    dir1=directory, dir2=directory2))
            if self.dryrun:
                print(
                    "directory: '{}' not writable and disabled, using '{}' as default "
                    .format(directory, directory2),
                    file=sys.stderr)
            self.directory = directory2
            self.directory2 = ""
        else:
            self.directory = directory
            if dirIsWritable(directory2):
                self.directory2 = directory2
            else:
                if self.dryrun:
                    print("directory2: '{}' not writable and disabled".format(
                        directory2),
                          file=sys.stderr)
                self.directory2 = ""

        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(os.path.join(self.directory, self.WORK_DIR),
                    exist_ok=True)

        self.statusfile = os.path.join(self.directory,
                                       "snapRemoteRunner_working")
        # The status file acts as a simple lock so that only one instance runs.
        # It is not failsafe (no flock on lustre, possibly different
        # directories), but good enough.
        if os.path.exists(self.statusfile):
            last_touched = datetime.datetime.fromtimestamp(
                os.lstat(self.statusfile).st_mtime)
            if not self.dryrun:
                age = datetime.datetime.now() - last_touched
                if age > datetime.timedelta(hours=3):
                    # release a status file that has been hanging for more
                    # than 3 hours; this instance still does not run
                    print("cleaning up {} after 3 hours".format(
                        self.statusfile),
                          file=sys.stderr)
                    _cleanupFileCallable(self.statusfile)()
                return
            # dryrun: only report what the existing status file contains
            with open(self.statusfile, 'rt') as fh:
                content = fh.read()
            print("status-file exists at '{}' with:".format(self.statusfile),
                  file=sys.stderr)
            print(content, file=sys.stderr)
        elif not self.dryrun:
            with open(self.statusfile, 'wt') as fh:
                # register the cleanup callable for interpreter exit
                atexit.register(_cleanupFileCallable(self.statusfile))
                note = "working pid: {} on node: {}\n".format(
                    os.getpid(),
                    os.uname().nodename)
                fh.write(note)
                if DEBUG:
                    print(note)

        self._check_and_unpack_new_files()

    def write_status(self, task, tag, msg=""):
        '''Write a status file to the remote host, ignoring ordinary errors.

        Delegates to _write_status(). Any Exception is printed as a traceback
        and swallowed, so status reporting never aborts the caller.
        KeyboardInterrupt and SystemExit are deliberately not caught.
        '''
        try:
            return self._write_status(task, tag, msg)
        except Exception:
            traceback.print_exc()

    def _write_status(self, task, tag, msg=""):
        '''Append one status line for task to its work file and upload the file.

        The line is appended to <directory>/work/<task.status_filename()> and
        the file is then copied to self.remote_dir with self.ssh.put_files().

        tag selects the line that is written:
            'downloading' -> 100, 'running' -> 101, 'success' -> 202,
            'error' -> 409, 'internal' -> 500;
            any other tag is written verbatim together with msg.

        Old codes from perl:
        if ($status_number == 100) {$text = ":Getting ARGOS data from server";}
        if ($status_number == 200) {$text = ":Finished getting ARGOS-data from server";}
        if ($status_number == 201) {$text = ":Finished running ${model}";}
        if ($status_number == 202) {$text = ":Finished extracting ${model} data for ARGOS";}
        if ($status_number == 401) {$text = ":$run_ident" . "_${model}_input does not exist";}
        if ($status_number == 402) {$text = ":$run_ident" . "_${model}_iso does not exist";}
        if ($status_number == 403) {$text = ":$run_ident" . "_${model}_src does not exist";}
        if ($status_number == 404) {$text = ":Inconsistent isotope identification (isotop-navn)";}
        if ($status_number == 408) {$text = ":Initial time not covered by NWP database";}
        if ($status_number == 409) {$text = ":${model} output data do not exist";}
        my $message = "$status_number" . ":" . "$timestamp" . ":" . "$text";
        '''
        filename = task.status_filename()
        work_file = os.path.join(self.directory, self.WORK_DIR, filename)
        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M')

        # Build the line first so the work file is only touched when the
        # message could actually be formatted. Task attributes are read only
        # for the tags that need them.
        if tag == 'downloading':
            line = "{x}:{ts}::Getting ARGOS data from server\n".format(
                x=100, ts=timestamp)
        elif tag == 'success':
            line = "{x}:{ts}::Finished extracting {model} data for ARGOS\n".format(
                x=202, ts=timestamp, model=task.model)
        elif tag == 'error':
            line = "{x}:{ts}::{model} output data do not exist\n".format(
                x=409, ts=timestamp, model=task.model)
        elif tag == 'running':
            line = "101:{ts}::running {model}\n".format(ts=timestamp,
                                                        model=task.model)
        elif tag == 'internal':
            line = "{x}:{ts}::internal error, cannot start job in queue in dir '{rundir}'\n".format(
                x=500, ts=timestamp, rundir=task.rundir)
        else:
            line = "{tag}:{ts} {msg}\n".format(ts=timestamp, tag=tag, msg=msg)

        with open(work_file, 'a+') as fh:
            fh.write(line)
        # NOTE(review): 30 is presumably a timeout in seconds - confirm
        self.ssh.put_files([work_file], self.remote_dir, 30)

    def _check_and_unpack_new_files(self):
        '''Download new files from the remote machine and dispatch them.

        - Copy <remote_dir>/upload/* to <directory>/upload.
        - Move files whose name does not match <ident>_ARGOS2<model>.zip to
          <directory>/rejected and schedule them for remote deletion.
        - For matching files build a SnapTask. Complete tasks are handled on
          self.hpc (skipped in dryrun), reported as 'running' or 'internal'
          and scheduled for remote deletion; incomplete ones are reported as
          'downloading'.
        - Remove the scheduled files from the remote upload directory
          (skipped in dryrun or when there is nothing to remove).

        Raises subprocess.CalledProcessError when the download fails with a
        return code other than 1.
        '''
        remote_files = os.path.join(self.remote_dir, self.UPLOAD_DIR, '*')
        local_upload = os.path.join(self.directory, self.UPLOAD_DIR)
        local_rejected = os.path.join(self.directory, self.REJECTED_DIR)
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(local_upload, exist_ok=True)
        os.makedirs(local_rejected, exist_ok=True)

        try:
            self.ssh.get_files([remote_files], local_upload, 30)
        except subprocess.CalledProcessError as cpe:
            # code 1 is generic error, e.g. no files, 2 is connection error
            if cpe.returncode != 1:
                raise

        delete_in_upload = []
        if DEBUG:
            print("checking files in uploaddir: {}".format(local_upload))
        for f in os.listdir(local_upload):
            if DEBUG:
                print("found file: {}".format(f))
            local_path = os.path.join(local_upload, f)
            if not os.path.isfile(local_path):
                continue

            m = re.match(r'([\w\-\.:]*)_ARGOS2(.*)\.zip', f)
            if not m:
                # unexpected name: park locally and remove it from the remote
                os.rename(local_path, os.path.join(local_rejected, f))
                delete_in_upload.append(f)
                continue

            if DEBUG:
                print("found zip-file: '{}'".format(f))
            task = SnapTask(topdir=self.directory,
                            backupdir=self.directory2,
                            zip_file=f,
                            ident=m.group(1),
                            model=m.group(2),
                            scpdestination=self.scpdestination,
                            scpoptions=" ".join(self.ssh.scp_options))
            if not task.is_complete(reldir=self.UPLOAD_DIR):
                self.write_status(task, tag='downloading')
                continue

            if DEBUG:
                print("handling zipfile: {}".format(f))
            if not self.dryrun:
                if task.handle(self.hpc):
                    self.write_status(task, tag='running')
                else:
                    self.write_status(task, tag='internal')
            delete_in_upload.append(f)

        # NOTE(review): these paths are relative to the remote login
        # directory ("upload/<file>"), whereas the download above is prefixed
        # with remote_dir - confirm that this is intended.
        delete_upload_files = [
            os.path.join(self.UPLOAD_DIR, f) for f in delete_in_upload
        ]
        if DEBUG:
            print("deleting remotely: " + ", ".join(delete_upload_files))
        # 'rm' without operands is an error, so skip the remote call entirely
        # when there is nothing to delete
        if delete_upload_files and not self.dryrun:
            self.ssh.syscall('rm', delete_upload_files, 30)