Esempio n. 1
0
File: job.py Progetto: icui/pyper
def load() -> Node:
    """Create or load main task based on config.toml.

    The job is stored in ``cache['job']`` so that it is only built once.
    If no cached job exists yet, it is loaded from ``job.pickle`` in the base
    directory when that file is present; otherwise it is created by calling
    the factory named by the ``main`` entry of the ``job`` config section
    (``main[0]`` is the module to import, ``main[1]`` the attribute to call).

    Returns:
        The cached job.

    Raises:
        FileNotFoundError: Neither ``job.pickle`` nor a ``job.main`` config
            entry is available.
    """
    from pypers import basedir as d

    if 'job' not in cache:
        if d.has('job.pickle'):
            # resume a previously saved job
            cache['job'] = d.load('job.pickle')

        elif main := getcfg('job', 'main'):
            # build a fresh job from the configured factory
            module = import_module(main[0])
            cache['job'] = getattr(module, main[1])()

        else:
            raise FileNotFoundError('job.pickle not exists')

    # hand the job back to the caller (the function is annotated -> Node)
    return cache['job']
Esempio n. 2
0
 def __getitem__(self, key: Union[str, int]):
     """Look up a child node by index or a value by name.

     Integer keys index ``self._nodes``. Other keys are searched in
     ``self._dict``, then ``self._kwargs``, then delegated to the parent
     (when there is one), and finally read from the ``workspace`` config
     section. The result is ``None`` when the config has no such entry.
     """
     # integer keys address child nodes directly
     if isinstance(key, int):
         return self._nodes[key]

     # values stored on this node take precedence
     own = self._dict
     if key in own:
         return own[key]

     extra = self._kwargs
     if key in extra:
         return extra[key]

     # otherwise let the parent resolve the key
     parent = self.parent
     if parent:
         return parent[key]

     # last resort: workspace configuration (None when undefined)
     return getcfg('workspace', key)
Esempio n. 3
0
 def __contains__(self, key):
     """Tell whether ``key`` can be resolved from this node.

     Checks ``self._dict``, ``self._kwargs`` and the ``workspace`` config
     section in that order, then falls back to the parent when there is one.
     """
     if key in self._dict:
         return True

     if key in self._kwargs:
         return True

     if getcfg('workspace', key) is not None:
         return True

     # nothing local: the answer is whatever the parent says, if any
     parent = self.parent
     return key in parent if parent else False
Esempio n. 4
0
def maketime(walltime: Optional[Union[float, str]]):
    """Check that the remaining job time exceeds the requested walltime.

    A string ``walltime`` is treated as the name of an entry in the
    ``walltime`` config section; an undefined entry is reported through
    ``console.error`` and the check is skipped.

    When the walltime is not smaller than the remaining time,
    ``InsufficientTime`` is raised if the ``r`` argument is set and
    ``job.requeue`` is configured; otherwise the message is only printed
    to stderr.
    """
    remaining = checktime()

    # resolve a named walltime through the configuration
    if isinstance(walltime, str):
        configured = getcfg('walltime', walltime)

        if configured is None:
            console.error(f'warning: walltime `{walltime}` is not defined')

        walltime = configured

    # nothing to do when no walltime is given or enough time remains
    if not (walltime and walltime >= remaining):
        return

    msg = f'Insufficient execution time ({walltime:.2f}min / {remaining:.2f}min)'

    if hasarg('r') and getcfg('job', 'requeue'):
        raise InsufficientTime(msg)

    print(msg, file=stderr)
Esempio n. 5
0
def _dispatch(lock: asyncio.Lock, nnodes: int) -> bool:
    """Reserve ``nnodes`` nodes for ``lock`` if that many are free.

    Returns ``True`` and records the reservation in ``_running`` when the
    request fits into the nodes not currently in use, ``False`` otherwise.

    Raises:
        RuntimeError: The request exceeds the total node count of the job.
    """
    ntotal: int = getcfg('job', 'nnodes')

    # a request larger than the whole allocation can never be satisfied
    if nnodes > ntotal:
        raise RuntimeError(f'Insufficient nodes ({nnodes} / {ntotal})')

    # nodes left after subtracting everything that is currently running
    available = ntotal - sum(_running.values())

    if nnodes > available:
        return False

    _running[lock] = nnodes
    return True
Esempio n. 6
0
File: main.py Progetto: icui/pyper
from pypers.core.config import getarg, hasarg, getsys, getcfg
from pypers.core.job import load, add_error
from pypers.core.workflow.directory import Directory

if __name__ == "__main__":
    if not hasarg('mpiexec'):
        # regular invocation: load the job, then submit or run it
        job = load()

        if hasarg('s'):
            job.submit()

        else:
            job.run()

            # requeue if the job stopped because of insufficient time:
            # it is neither done nor failed, yet an exception was recorded
            if not (job.done or job.error) and job.exception:
                if hasarg('r') and getcfg('job', 'requeue'):
                    getsys('requeue')()

    else:
        # MPI task: --mpiexec carries "<directory>:<pickle id>"
        try:
            cwd, fid = getarg('mpiexec').split(':')
            func = Directory(cwd).load(f'{fid}.pickle')
            result = func()

            # coroutine results are driven to completion
            if iscoroutine(result):
                run(result)

        except Exception as e:
            add_error(e)
Esempio n. 7
0
async def mpiexec(d: Directory, cmd: Union[str, Callable],
    nprocs: int, cpus_per_proc: int, gpus_per_proc: int, walltime: Optional[Union[float, str]], resubmit: bool = False):
    """Schedule the execution of an MPI task.

    Waits until enough nodes are free (tracked in the module-level
    ``_running`` and ``_pending`` tables), wraps ``cmd`` with the system's
    ``mpiexec`` command, runs it as a subprocess and appends its output to
    ``<fid>.out`` inside ``d``. The node reservation is always released on
    exit and waiting tasks are dispatched, including when this coroutine is
    cancelled.

    Args:
        d: Directory used for the pickled callable, the ``.out`` log and the
            ``.error`` file.
        cmd: Shell command, or a callable that is pickled into ``d`` and
            executed through ``python -m "pypers.core.main" --mpiexec=...``.
        nprocs: Number of MPI processes.
        cpus_per_proc: CPUs per process, used to compute the node count.
        gpus_per_proc: GPUs per process; when positive, the node count is the
            larger of the CPU-based and GPU-based requirements.
        walltime: Expected run time in minutes, or the name of an entry in
            the ``walltime`` config section.
        resubmit: Raise ``ResubmitJob`` instead of ``RuntimeError`` when the
            task fails.

    Raises:
        InsufficientTime: The subprocess did not finish within the timeout
            (only enforced when ``walltime`` is set, the ``r`` argument is
            given and ``job.requeue`` is configured).
        ResubmitJob or RuntimeError: ``<fid>.error`` exists in ``d`` or the
            subprocess exited with a non-zero code.
    """
    # task queue controller
    lock = asyncio.Lock()

    # get walltime from config
    if isinstance(walltime, str):
        walltime = cast(Optional[float], getcfg('walltime', walltime))

    try:
        # calculate node number
        nnodes = int(ceil(nprocs * cpus_per_proc  / getsys('cpus_per_node')))

        if gpus_per_proc > 0:
            nnodes = max(nnodes, int(ceil(nprocs * gpus_per_proc  / getsys('gpus_per_node'))))

        # wait for node resources
        await lock.acquire()

        if not _dispatch(lock, nnodes):
            # not enough free nodes: the second acquire blocks until another
            # task's cleanup dispatches this entry and releases the lock
            _pending[lock] = nnodes
            await lock.acquire()

        # make sure remaining time is enough
        if walltime:
            maketime(walltime)

        # save function as pickle to run in parallel
        if callable(cmd):
            funcname = get_name(cmd) + '\n'
            cwd = None
            fid = f'mpiexec.{id(cmd)}'
            d.rm(f'{fid}.*')
            d.dump(cmd, f'{fid}.pickle')
            cmd = f'python -m "pypers.core.main" --mpiexec={d.rel()}:{fid}'

        else:
            funcname = ''
            cwd = d.rel()
            fid = 'mpiexec'

        # wrap with parallel execution command
        cmd = getsys('mpiexec')(cmd, nprocs, cpus_per_proc, gpus_per_proc)

        # write the header in its own `with` so it is flushed and closed
        # before the subprocess starts writing to the same file
        with open(d.rel(f'{fid}.out'), 'a') as f:
            f.write(f'\n{funcname}{cmd}\n\n')

        # create subprocess to execute task
        with open(d.rel(f'{fid}.out'), 'a') as f:
            process = await asyncio.create_subprocess_shell(cast(str, cmd), cwd=cwd, stdout=f, stderr=f)
            time_start = time()

            if walltime and hasarg('r') and getcfg('job', 'requeue'):
                # wait at most max(walltime, remaining time - 1) minutes
                # NOTE(review): `max` lets the wait exceed the remaining job
                # time when walltime is larger -- confirm this is intended
                try:
                    await asyncio.wait_for(process.communicate(), max(walltime, checktime() - 1) * 60)

                except asyncio.TimeoutError:
                    raise InsufficientTime(f'not enough time for {cmd}')

                # save execution time history
                f.write(f'\nwalltime: {walltime}')

            else:
                await process.communicate()

            f.write(f'\nelapsed: {(time()-time_start)/60:.2f}\n')

        # catch error
        errcls = ResubmitJob if resubmit else RuntimeError

        if d.has(f'{fid}.error'):
            raise errcls(d.read(f'{fid}.error'))

        if process.returncode:
            raise errcls(f'{cmd}\nexit code: {process.returncode}')

    finally:
        # Cleanup lives in `finally` so that it also runs on cancellation:
        # asyncio.CancelledError derives from BaseException and would bypass
        # an `except Exception` handler, leaving nodes reserved forever.

        # clear entry
        if lock in _pending:
            del _pending[lock]

        if lock in _running:
            del _running[lock]

        # sort entries by their node number
        pendings = sorted(_pending.items(), key=lambda item: item[1], reverse=True)

        # execute tasks if resource is available
        for pending_lock, pending_nodes in pendings:
            if _dispatch(pending_lock, pending_nodes):
                del _pending[pending_lock]
                pending_lock.release()
Esempio n. 8
0
def checktime():
    """Return the remaining walltime of the job in minutes.

    This is the configured ``job.walltime`` minus the minutes elapsed
    since ``_starttime``.
    """
    total_minutes = getcfg('job', 'walltime')
    elapsed_minutes = (time() - _starttime) / 60

    return total_minutes - elapsed_minutes
Esempio n. 9
0
 def name(self) -> str:
     """Name of the workspace.

     The configured ``job.name`` when the relative path is ``'.'``,
     otherwise the last ``/``-separated component of the relative path.
     """
     if self.rel() == '.':
         return getcfg('job', 'name')

     # keep only the final path component
     return self.rel().rsplit('/', 1)[-1]