Beispiel #1
0
def WriteSimulationInfo(Ddata,
                        mutAges,
                        mutPops,
                        mutFreqs,
                        nreplicas,
                        suffix,
                        inputParamsFiles,
                        pop2name=pop2name,
                        powerSfx='',
                        getio=None):
    """Write the info files that describe the contents of a simulation.

    Per the original note, these files are consumed by sim_analysis_pipe.pl
    to define the simulation analysis pipeline.

    Under ``Ddata + '/config' + suffix`` three files are written:

      - ``scenarios<powerSfx>.txt``: one scenario directory per line, for each
        scenario returned by GetScenarios(mutAges, mutPops, mutFreqs);
      - ``sims<powerSfx>.txt``: two lines, ``0`` and ``nreplicas - 1``;
      - ``pops<powerSfx>.txt``: one ``name<TAB>number`` line per pop2name entry.

    In addition, the standard per-test config files (lrh, ihs, xpop) are
    copied from the standard config directory to their per-test destinations.

    Params:

       inputParamsFiles - parameter files whose concatenated contents are
          checked for 'pop_define' lines matching mutPops.
       pop2name - map from population number to population name; its keys
          (converted to int) must equal the set of mutPops.
       getio - if true, nothing is read or written; a dict describing the
          files this step depends on and creates is returned instead.
    """

    stdDir = '../Data/Shari_Data/sim/stdSimAnalConfig'

    # Deliberately kept as a list comprehension with the loop variable named
    # `test`: Str() fills in $Ddata, $test, $suffix and $powerSfx by name, so
    # neither the variable names nor the enclosing scope may change.
    configFiles = dict([(stdDir + '/' + test + '_config.txt',
                         Str("$Ddata/power_$test$suffix/config$powerSfx.txt"))
                        for test in ('lrh', 'ihs', 'xpop')])

    cfgDir = Ddata + '/config' + suffix

    if getio:
        inputs = list(configFiles.keys()) + list(inputParamsFiles)
        outputs = [cfgDir + '/' + f + powerSfx + '.txt'
                   for f in ('scenarios', 'sims', 'pops')]
        outputs += list(configFiles.values())
        return dict(depends_on=inputs, creates=outputs)

    neutralParams = reduce(concat,
                           [SlurpFile(fn) for fn in inputParamsFiles],
                           '')

    # The populations declared in the parameter files must be exactly the
    # populations being analyzed.
    assert sorted(int(ln.split()[1])
                  for ln in neutralParams.split('\n')
                  if ln.startswith('pop_define')) == sorted(mutPops)

    assert {int(k) for k in pop2name.keys()} == set(mutPops)

    scenLines = [scen.scenDir()
                 for scen in GetScenarios(mutAges, mutPops, mutFreqs)]
    DumpFile(cfgDir + '/scenarios%s.txt' % powerSfx, '\n'.join(scenLines))

    DumpFile(cfgDir + '/sims%s.txt' % powerSfx, '%d\n%d' % (0, nreplicas - 1))

    popLines = ['%s\t%d' % (popName, popNum)
                for popNum, popName in pop2name.items()]
    DumpFile(cfgDir + '/pops%s.txt' % powerSfx, '\n'.join(popLines))

    for fromFile, toFile in configFiles.items():
        copyfile(fromFile, toFile)
Beispiel #2
0
def doSplit(splitFunc, splitFN, outDir, getio=None):
    """Split a file into chunks and record the list of chunk files.

    Params:

       splitFunc - callable invoked as splitFunc(splitFN, outDir=outDir);
          it must return the names of the chunk files it produced.
       splitFN - the file to split.
       outDir - directory handed to splitFunc; the chunk list is written
          there as 'chunks.txt', one chunk file name per line.
       getio - if true, do no work and return a dict naming the file this
          step depends on and the file it creates.
    """

    chunkListFN = os.path.join(outDir, 'chunks.txt')

    if getio:
        return {'depends_on': splitFN, 'creates': chunkListFN}

    DumpFile(chunkListFN, '\n'.join(splitFunc(splitFN, outDir=outDir)))
Beispiel #3
0
def CreateSimsParams_neutral(Ddata, suffix, inputParamsFiles, getio=None):
    """Write the neutral parameter file.

    The contents of all input parameter files are concatenated, in order,
    and written to ``Ddata + '/params_neutral' + suffix``.

    Params:

       Ddata - directory prefix for the output file.
       suffix - string appended to the output file name.
       inputParamsFiles - one parameter file or a sequence of them
          (normalized with MakeSeq).
       getio - if true, do no work and return a dict naming the files this
          step depends on and the file it creates.
    """

    inputParamsFiles = MakeSeq(inputParamsFiles)

    neutralParamsFile = Ddata + '/params_neutral' + suffix

    if getio:
        return dict(depends_on=inputParamsFiles, creates=neutralParamsFile)

    # Seed the fold with '' (as WriteSimulationInfo does) so that an empty
    # list of input files yields an empty output instead of a TypeError from
    # reduce() on an empty sequence.
    neutralParams = reduce(concat, map(SlurpFile, inputParamsFiles), '')

    DumpFile(neutralParamsFile, neutralParams)
Beispiel #4
0
def checkTableKey(inFN,
                  cols,
                  comparison='lt',
                  writeCheckedFile=True,
                  tsvOpts={},
                  lineFilter=None,
                  lineFilterCols=(),
                  getio=None):
    """Check that in the given table, record identifiers increase uniformly.

    Each record that passes the optional filter is compared with the previous
    such record; the check fails on the first pair for which the comparison
    does not hold.

    Params:

       inFN - the table file to check.
       cols - the column(s) whose tuple should uniformly increase.
       comparison - this comparison must be true between each record and the
          next.  It is the name of a routine in the operator module
          (e.g. 'lt', 'le').
       writeCheckedFile - if true, write a marker file recording that the
          check passed; if false, no marker file is written.
       tsvOpts - extra keyword arguments passed through to IDotData.
          The default dict is only unpacked, never modified.
       lineFilter - optional predicate on a record; records for which it
          returns a false value are skipped (and counted as skipped).
       lineFilterCols - additional column(s) to load so that lineFilter can
          look at them.
       getio - if true, do no work and return a dict describing what this
          step depends on and creates.

    Raises:

       AssertionError - if some record and its successor do not satisfy
          the comparison.
    """

    cols = tuple(MakeSeq(cols))
    lineFilterCols = tuple(MakeSeq(lineFilterCols))
    checkedFN = Str('$inFN.checked_${comparison}') + Sfx(*cols)
    if getio:
        return dict(depends_on=inFN,
                    creates=checkedFN if writeCheckedFile else (),
                    attrs=dict(piperun_short=True))

    comparisonFunc = getattr(operator, comparison)
    prevRec = None
    loadCols = cols + lineFilterCols

    # NOTE: the local names below (i, prevRec, thisRec, nchecked, nskipped, ...)
    # are referenced by name from the Str() and dbg() strings; do not rename.
    nskipped = 0
    nchecked = 0
    for i, r in enumerate(IDotData(inFN, ToLoad=loadCols, **tsvOpts)):
        if lineFilter and not lineFilter(r):
            nskipped += 1
            continue

        thisRec = r[cols] if IsSeq(r) else (r, )
        # Compare only once a previous unfiltered record exists.  Testing the
        # row index instead would compare against None whenever the leading
        # rows were filtered out.
        if prevRec is not None and not comparisonFunc(prevRec, thisRec):
            errMsg = Str('at line $i of $inFN, looking at $cols: $prevRec is not $comparison $thisRec'
                         )
            logging.error(errMsg)
            # Raise explicitly: a bare `assert False` is removed under -O,
            # which would let a failed check fall through as a success.
            raise AssertionError(errMsg)
        nchecked += 1
        prevRec = thisRec

    dbg('nchecked nskipped')
    # Honor writeCheckedFile, matching the outputs declared in the getio branch.
    if writeCheckedFile:
        DumpFile(checkedFN, 'checked ok.')
Beispiel #5
0
def RunTasks( options ):
    """Take tasks from the specified queue directories, and run them.

    The runner repeatedly scans each queue directory for task directories,
    tries to claim one eligible task at a time by exclusively creating its
    claim file, runs the task's command through a generated run.sh script,
    and then records the outcome in the task directory.  It returns when a
    stop signal (SIGUSR1) is received, when options.maxRunHours is exceeded,
    or when it has been idle for longer than the configured wait time.

    Params:

       options - see command-line parameter definition in main() below.
          Attributes read here: remote, password, pkey, queues, maxRunHours,
          claimedFN, readyFN, maxMem, minMem, minProc, local_tasks,
          only_local_tasks, runOnlyShort, runOnlyLong, noRequeuedTasks,
          onlyFromHost, onlyFromPipelineId, localDataDir, aftTaskDelay,
          maxWaitTime, maxFirstWaitTime, taskCheckInterval.

    NOTE: several local names (stopSignal, numProcsAvail, queues, queueNum,
    queue, taskDirs, targetDirs, ranCommand, lastFinish) are referenced by
    name from the strings passed to dbg(); do not rename them.
    """

    if haveParamiko: Random.atfork()

    startClock = time.time()

    logging.info( 'Starting runner (process id %d on host %s) with options %s'
                  % ( os.getpid(), GetHostName(), options ) )

    # One-element list so that the nested signal handler can set the flag.
    stopSignal = [ False ]

    def SetStopSignal( sigNum, stkFrm ):
        logging.info( 'Setting stop signal to stop runners' )
        stopSignal[ 0 ] = True
        dbg( '"aftset" stopSignal' )

    signal.signal( signal.SIGUSR1, SetStopSignal )

    fs = RemoteFileSystem( remote = options.remote, pw = options.password, pkey = options.pkey ) \
        if options.remote else LocalFileSystem()

    # check that all queues exist
    assert all( fs.exists( queue ) and fs.isdir( queue )
                for queue in options.queues.split( fs.pathsep ) )

    # register a cleanup routine so that, if we claim a task and then
    # crash midway through it, our lock on the task is erased, so that
    # another runner can pick up the task.  Note that if the _task_ fails
    # with an error code, that's fine -- we just report the error code
    # to the mqsub.sh script instance that submitted the task.
    # The cleanup here happens only if the runner crashes before receiving
    # a proper exit code from the task.
    fileToErase = [ None ]
    @atexit.register
    def DoErase( eraseWhat = fileToErase ):
        if eraseWhat[0] and fs.exists(eraseWhat[0]):
            fs.remove( eraseWhat[0] )

    # var: lastFinish - time when a task last finished.
    lastFinish = time.time()

    queues = options.queues.split( fs.pathsep )
    # Last seen modification time of each queue's newtask.dat marker file.
    lastQueueModTime = [ None ] * len( queues )

    # Queue directory entries that are never task directories.
    skipDirs = set([ 'newtask.dat' ] )

    numTasksRun = 0
    numProcsAvail = int( os.getenv( 'LSB_DJOB_NUMPROC', 1 ) )
    dbg( 'numProcsAvail' )

    for queue in queues:
        EnsureDirExists( os.path.join( queue, 'succ' ) )
        EnsureDirExists( os.path.join( queue, 'fail' ) )

    while not stopSignal[0]:

        ranCommand = False

        if options.maxRunHours > 0 and ( time.time() - startClock  ) / 3600.0 > options.maxRunHours:
            # The division must be parenthesized: '%' and '/' have equal
            # precedence, so without the parentheses the string would be
            # formatted first and then divided by 3600.0 (a TypeError).
            logging.info( 'Runner exiting after CPU time of %s hours' % ( ( time.time() - startClock ) / 3600.0 ) )
            return

        if stopSignal[ 0 ]:
            logging.info( 'Runner stopped by stop signal' )
            return
        else: dbg( '"chkstop" stopSignal' )

        dbg( 'queues' )
        for queueNum, queue in enumerate( queues ):
            dbg( 'queueNum queue' )

            # do a quick check to see if any tasks have been added to the queue since we last checked
            newTaskFN = os.path.join( queue, 'newtask.dat' )
            try:
                curQueueModTime = fs.stat( newTaskFN ).st_mtime
                if curQueueModTime == lastQueueModTime[ queueNum ]: continue
                lastQueueModTime[ queueNum ] = curQueueModTime
            except EnvironmentError as e:
                # A missing marker file is not an error; in either case we
                # fall through and scan the queue directory.
                if os.path.exists( newTaskFN ):
                    logging.warning( 'ERROR CHECKING FOR NEW TASKS in queue %s: %s' % ( queue, e ) )

            # find an unclaimed task in this queue, and try to claim it
            taskDirs = sorted( fs.listdir( queue ) )
            dbg( 'len(taskDirs)' )
            dbg( 'os.environ.get("MQ_FIRST_DIR")' )
            if 'MQ_FIRST_DIR' in os.environ and os.environ[ 'MQ_FIRST_DIR' ] in taskDirs:

                taskDirs = [ os.environ[ 'MQ_FIRST_DIR' ] ] + taskDirs
                logging.info( 'putting specified dir first' )

            for taskDir in taskDirs:

                if taskDir in skipDirs: continue

                if options.maxRunHours > 0 and ( ( time.time() - startClock ) / 3600.0 ) > options.maxRunHours:
                    logging.info( 'Runner exiting after CPU time of %s hours' % ( ( time.time() - startClock ) / 3600.0 ) )
                    return

                if stopSignal[ 0 ]:
                    logging.info( 'Runner stopped by stop signal' )
                    return
                else: dbg( '"chkstop" stopSignal' )

                try:

                    # While a noclaim.dat file is present in the queue, hold off claiming tasks.
                    while fs.path.exists( os.path.join( queue, 'noclaim.dat' ) ):
                        time.sleep( 60 + random.normalvariate( 10.0, 5.0 ) )

                    fullTaskDir = fs.path.join( queue, taskDir )
                    claimedFN = fs.path.join( fullTaskDir, options.claimedFN )

                    attrsFN = fs.path.join( fullTaskDir, 'attrs.tsv' )
                    cwdFN = fs.path.join( fullTaskDir, 'submitdir.txt' )

                    # Names of the eligibility conditions that failed, for the log message below.
                    failedCond = []

                    def saveVal( name, val, fc = failedCond ):
                        if not val: fc.append( name )
                        return val

                    if saveVal( 'ready', fs.path.exists( fs.path.join( fullTaskDir, options.readyFN ) ) ) \
                            and saveVal( 'not claimed', not fs.path.exists( claimedFN ) ) \
                            and saveVal( 'relocatable', ( not options.remote or \
                                                              all([ not f.startswith( '/' ) for which in ( 'sources', 'targets' )  \
                                          for f in fs.SlurpFile( fs.path.join( fullTaskDir, which + '.lst' ) ) ]) ) ) \
                            and saveVal( 'memOk', GetMemReq( fs, attrsFN ) <= options.maxMem ) \
                            and saveVal( 'minMemOk', options.minMem == 0 or GetMemReq( fs, attrsFN ) >= options.minMem ) \
                            and saveVal( 'minProc', GetProcReq( fs, attrsFN ) >= options.minProc ) \
                            and saveVal( 'maxProc', GetProcReq( fs, attrsFN ) <= numProcsAvail ) \
                            and saveVal( 'local', ( options.local_tasks or not GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                            and saveVal( 'onlyLocal',
                                         ( not options.only_local_tasks or GetTaskAttr( fs, attrsFN, 'piperun_run_locally', False ) ) ) \
                            and saveVal( 'short', ( not options.runOnlyShort or GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) )  \
                            and saveVal( 'long', ( not options.runOnlyLong or not GetTaskAttr( fs, attrsFN, 'piperun_short', False ) ) )  \
                            and saveVal( 'notRequeued', ( not options.noRequeuedTasks or not fs.path.exists( fs.path.join( fullTaskDir, 'requeued.dat' ) ) ) ) \
                            and saveVal( 'notFromHost', ( not options.onlyFromHost or socket.getfqdn() == options.onlyFromHost ) ) \
                            and saveVal( 'notFromPipeline', ( not options.onlyFromPipelineId or  \
                                                                  GetTaskAttr( fs, attrsFN, 'piperun_pipelineId' ) == options.onlyFromPipelineId ) ):

                        # try to claim the task
                        try:
                            fd = fs.open(claimedFN, os.O_CREAT|os.O_EXCL|os.O_WRONLY)
                        except EnvironmentError:
                            # another runner beat us to this task -- go and check other tasks
                            logging.info( 'another job beat us to claiming ' + fullTaskDir )
                            continue

                        # Record who holds the claim, along with our environment.
                        try:
                            fs.write( fd, 'locked by process %d on host %s\n' % ( os.getpid(), GetHostName() ) )
                            for v in list(os.environ.keys()):
                                fs.write( fd, '%s=%s\n' % ( v, os.environ[v] ) )
                        finally:
                            fs.close( fd )
                        # Tell our cleanup code to release this task if we crash.
                        fileToErase[0] = claimedFN
                        # get the command to run the task
                        theCMD = fs.SlurpFile( os.path.join( fullTaskDir, 'command.dat' ) ).strip()
                        theCmdDir = fs.SlurpFile( os.path.join( fullTaskDir, 'submitdir.txt' ) ).strip()
                        theCmdEnvFN = os.path.join( fullTaskDir, 'submitenv.txt' )

                        if options.remote:
                            assert have_fcntl
                            SystemSucceed( 'mkdir -p ' + os.path.join( options.localDataDir, fs.root[1:] ) )
                            for needDir in 'Operations', 'Classes', 'System', 'Other':
                                needDirFull = os.path.join( options.localDataDir, fs.root[1:], '..', needDir )
                                if not os.path.exists( needDirFull ):
                                    os.symlink( os.path.realpath( os.path.join( '..', needDir ) ), needDirFull )

                            # copy source files

                            # get exclusive locks on the source files
                            srcFiles = sorted( set( fs.SlurpFile( os.path.join( fs.root, fullTaskDir, 'sources.lst' ) ).rstrip( '\n' ).split( '\n' ) ) )
                            srcLockIds = []
                            srcLockFiles = []
                            for srcFile in srcFiles:
                                lockFile = os.path.join( options.localDataDir, 'mqlocks', srcFile[1:] )
                                if lockFile.endswith('/'): lockFile = lockFile[:-1]
                                lockFile += '.lock'
                                SystemSucceed( 'mkdir -p ' + os.path.dirname( lockFile ) )
                                gotLock = False
                                # Retry until the lock file can be created exclusively.
                                while not gotLock:
                                    try:
                                        openMode = os.O_CREAT|os.O_EXCL|os.O_WRONLY
                                        logging.info( 'opening ' + lockFile + ' with mode ' + str( openMode ) )
                                        lockId = os.open( lockFile, openMode )
                                        gotLock = True
                                    except EnvironmentError:
                                        logging.info( 'Could not create ' + lockFile + ' , waiting...' )
                                        time.sleep( 10 + random.normalvariate( 3.0, 1.0 ) )
                                fcntl.lockf( lockId, fcntl.LOCK_EX )
                                srcLockIds.append( lockId )
                                srcLockFiles.append( lockFile )
                                logging.info( 'Got lock on ' + lockFile )

                            SystemSucceed( 'rsync -zprv --files-from=:' + os.path.join( fs.root, fullTaskDir, 'sources.lst' ) +
                                           ' ' + fs.username + '@' + fs.hostname + ':/ ' + options.localDataDir )

                            # Release the locks in reverse order of acquisition.
                            # zip() is materialized first because on Python 3 it
                            # returns an iterator, which cannot be sliced.
                            for srcLockId, srcLockFile in list( zip( srcLockIds, srcLockFiles ) )[::-1]:
                                fcntl.lockf( srcLockId, fcntl.LOCK_UN )
                                os.close( srcLockId )
                                SystemSucceed( 'rm -rf ' + srcLockFile )

                            targets = fs.SlurpFile( os.path.join( fs.root, fullTaskDir, 'targets.lst' ) ).rstrip( '\n' ).split('\n')
                            targetDirs = set( map( os.path.dirname, [_f for _f in map( str.strip, targets ) if _f] ) )
                            dbg( '"DDDDDD" targetDirs' )

                            # Create the local directories that will receive the task's outputs.
                            for targetDir in targetDirs:
                                assert targetDir.startswith( '/' )
                                tdir = os.path.join( options.localDataDir, targetDir[1:] )
                                SystemSucceed( 'mkdir -p ' + tdir + ' ' + os.path.join( tdir, 'makeinfo' ) )

                            theCMD = 'cd ' + os.path.join( options.localDataDir, fs.root[1:] ) + ' && ' + theCMD

                        logging.info( 'Under ' + claimedFN + ' RUNNING: ' + theCMD )
                        # Actually run the task; get its exit code
                        save_cwd = os.getcwd()
                        try:
                            os.chdir( theCmdDir )
                            logging.info( 'CWD=' + os.getcwd() )

                            runScriptFN = os.path.join( fullTaskDir, 'run.sh' )

                            # Generate the shell script that runs the task's command.
                            with open( runScriptFN, 'w' ) as out:
                                out.write( '#!/usr/bin/env bash\n' )
                                out.write( 'set -e -o pipefail\n' )
                                with open( theCmdEnvFN ) as envFile:
                                    for line in envFile:
                                        # Stop at the first line that is not NAME=VALUE, or at 'module='.
                                        if '=' not in line or line.startswith('module='): break
                                        equalIdx = line.index( '=' )
                                        # NOTE(review): envVarName includes the trailing '=',
                                        # so re.search( r'\W', ... ) below always matches and
                                        # no 'export' line is ever written.  Behavior is left
                                        # unchanged here; confirm whether exporting the submit
                                        # environment is actually wanted before changing it.
                                        envVarName = line[ :equalIdx+1 ]
                                        if not ( re.search( r'\W', envVarName ) or envVarName.startswith( 'LSB_' ) or \
                                           envVarName.startswith( 'LSF_' ) or \
                                           envVarName.startswith( 'LS_' ) or envVarName.startswith( 'SLURM' ) or \
                                           envVarName in \
                                           ( 'SYS_TYPE', 'MACHTYPE', 'VENDOR', 'OSTYPE',
                                             'DOMAINNAME', 'HOSTTYPE', 'SHORTHOST', 'SSH_TTY',
                                             'HOST', 'HOSTNAME', 'REMOTEHOST', 'STY' ) ):
                                            out.write( 'export ' + envVarName + "'" + line[ equalIdx+1: -1 ] + "'\n" )
                                out.write( theCMD )

                            os.chmod( runScriptFN, stat.S_IXUSR | stat.S_IRWXU )

                            try:
                                exitCode = os.system( runScriptFN )
                            except ( KeyboardInterrupt, SystemExit ):
                                interruptedFN = os.path.join( fullTaskDir, 'interrupted.dat' )
                                DumpFile( interruptedFN, 'interrupted' )
                                raise
                        finally:
                            os.chdir( save_cwd )
                        logging.info( 'Under ' + claimedFN + ' FINISHED RUNNING: ' + theCMD )
                        logging.info( 'Got exit code %d' % exitCode )

                        if options.remote:
                            # copy the target files and the output log back to the correct dirs on the remote system

                            # first, make sure the files all exist, and are no longer being written to.

                            time.sleep( options.aftTaskDelay )

                            os.system( 'rsync -zprv --files-from=:' + os.path.join( fs.root, fullTaskDir, 'targets.lst' ) +
                                       ' ' + options.localDataDir + ' ' + fs.username + '@' + fs.hostname + ':/' )

                        # If we succeeded in running the task (whether the task itself failed or not),
                        # tell the cleanup code to NOT release this task if we crash.
                        fileToErase[0] = None
                        # Tell the task submitter script that we are done, and what the task's
                        # exit code was.

                        if os.path.exists( os.path.join( fullTaskDir, 'nmq.dat' ) ):

                            # Task dirs containing nmq.dat get no exitCode.dat; they are marked
                            # completed and moved into the queue's succ/ or fail/ directory.
                            time.sleep(3)
                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ), os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            fs.close(fd)

                            try:
                                shutil.move( fullTaskDir, os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) )
                            except EnvironmentError as e:
                                # str( e ): concatenating the exception object itself to a str raises TypeError.
                                logging.warning( 'Error moving ' + fullTaskDir + ' to ' + os.path.join( queue, 'succ' if exitCode == 0 else 'fail' ) + ' : ' + str( e ) )
                        else:
                            exitCodeFN = os.path.join( fullTaskDir, 'exitCode.dat' )
                            fd = fs.open( exitCodeFN, os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            bytesWritten = fs.write( fd, str( exitCode ) )
                            fs.close( fd )

                            time.sleep(3)
                            logging.info( 'Wrote exit code %s to file %s (%s bytes)' % ( exitCode, exitCodeFN, bytesWritten ) )

                            fd = fs.open( os.path.join( fullTaskDir, 'completed.dat' ), os.O_CREAT|os.O_EXCL|os.O_WRONLY )
                            fs.close(fd)

                        # Record that we actually ran a task here.
                        ranCommand = True
                        lastFinish = time.time()
                        numTasksRun += 1

                    else:
                        logging.info( 'did not take task ' + taskDir + ' ; reason: ' + str( failedCond ) )

                # NOTE(review): this bare except also catches the KeyboardInterrupt /
                # SystemExit re-raised above, so the runner logs it and moves on to the
                # next task.  Confirm whether that is intended before narrowing it.
                except:
                    excInfo = sys.exc_info()
                    logging.warning( 'Error trying to grab task from ' + taskDir + ' (%s), skipping...'
                                     % str( excInfo ) )
                    traceback.print_exc()

        dbg( 'ranCommand lastFinish time.time()-lastFinish' )
        if not ranCommand:
            # Nothing was run in this pass: exit if we have been idle too long
            # (with a separate limit for a runner that has not yet run any task),
            # otherwise sleep a randomized interval before rescanning.
            waitTimeHere = time.time() - lastFinish
            if ( numTasksRun > 0 and options.maxWaitTime > 0 and waitTimeHere > options.maxWaitTime ) \
                    or ( numTasksRun == 0 and options.maxFirstWaitTime > 0 and waitTimeHere > options.maxFirstWaitTime ) :
                logging.info( 'Runner exiting after idle time of %s' % waitTimeHere )
                return
            time.sleep( options.taskCheckInterval + random.normalvariate( 3.0, 1.0 ) )