Example 1
def parasolRestart():
    """Function starts the parasol hub and node.
    """
    parasolStop()
    while True:
        machineList = os.path.join(workflowRootPath(), "jobTree", "machineList")
        #pathEnvVar = os.environ["PATH"]
        os.system("paraNode start -hub=localhost") 
        #-umask=002 -userPath=%s -sysPath=%s" % (pathEnvVar, pathEnvVar))
        os.system("paraHub %s subnet=127.0.0 &" % (machineList,))
        tempFile = getTempFile()
        dead = True
        try:
            popen("parasol status", tempFile)
            fileHandle = open(tempFile, 'r')
            line = fileHandle.readline()
            while line != '':
                if "Nodes dead" in line:
                    print line
                    if int(line.split()[-1]) == 0:
                        dead = False
                line = fileHandle.readline()
            fileHandle.close()
        except RuntimeError:
            pass
        os.remove(tempFile)
        if not dead:
            break
        else:
            logger.info("Tried to restart the parasol process, but failed, will try again")
            parasolStop()
            time.sleep(5)
    logger.info("Restarted the parasol process")
Example 2
def find_analyses(target, recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir):
    """takes a set of records to analyze and finds the corresponding sequences and creates alignment targets"""
    files = {"template":[], "complement":[]}

    logger.info("Finding template analyses")
    for fastqFile in templateFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(), "template_" + name)
                files["template"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start : ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(Target.makeTargetFn(analyze, args=analysis))

    logger.info("Finding complement analyses")
    for fastqFile in complementFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(), "complement_" + name)
                files["complement"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start : ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(Target.makeTargetFn(analyze, args=analysis))

    target.setFollowOnTargetFn(merge, args=(files, outputDir))
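
The shapes this function consumes, shown with invented values (recordsToAnalyze is built in Example 29's main()): recordsToAnalyze maps read name -> [reference name, reference start, reference stop], and references maps reference name -> sequence.

recordsToAnalyze = {"read_42": ["chr1", 1000, 1250]}  #invented read name and coordinates
references = {"chr1": "ACGT" * 1000}                  #invented reference sequence
ref_name, ref_start, ref_stop = recordsToAnalyze["read_42"]
ref_seq = references[ref_name][ref_start:ref_stop]    #the 250 bp window handed to analyze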
Example 3
def main():
    parser = build_parser()
    Stack.addJobTreeOptions(parser)
    args = parser.parse_args()
    setLoggingFromOptions(args)

    if not os.path.exists(args.outDir):
        os.mkdir(args.outDir)

    if args.overwriteDb is True:
        if os.path.exists(args.mergedDb):
            os.remove(args.mergedDb)
        for g in args.genomes:
            if os.path.exists(os.path.join(args.outDir, g + ".db")):
                os.remove(os.path.join(args.outDir, g + ".db"))

    logger.info("Building paths to the required files")
    alnPslDict = parse_dir(args.genomes, args.dataDir, alignment_ext)
    seqTwoBitDict = parse_dir(args.genomes, args.dataDir, sequence_ext)
    geneCheckBedDict = parse_dir(args.genomes, args.dataDir, gene_check_ext)
    #geneCheckBedDetailsDict = parse_dir(args.genomes, args.geneCheckDir, gene_check_details_ext)

    refSequence = os.path.join(args.dataDir, args.refGenome + ".2bit")
    if not os.path.exists(refSequence):
        raise RuntimeError("Reference genome 2bit not present at {}".format(refSequence))
    args.refSequence = refSequence

    i = Stack(Target.makeTargetFn(build_analysis, args=(alnPslDict, seqTwoBitDict, geneCheckBedDict, 
            args.gencodeAttributeMap, args.genomes, args.annotationBed, args.outDir, args.primaryKey, 
            args.refGenome))).startJobTree(args)

    if i != 0:
        raise RuntimeError("Got failed jobs")

    merge_databases(args.outDir, args.mergedDb, args.genomes)
Example 4
def find_analyses(target, recordsToAnalyze, templateFastqFiles,
                  complementFastqFiles, references, outputDir):
    """takes a set of records to analyze and finds the corresponding sequences and creates alignment targets"""
    files = {"template": [], "complement": []}

    logger.info("Finding template analyses")
    for fastqFile in templateFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(),
                                       "template_" + name)
                files["template"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(
                    Target.makeTargetFn(analyze, args=analysis))

    logger.info("Finding complement analyses")
    for fastqFile in complementFastqFiles:
        for name, seq, qual in fastqRead(fastqFile):
            if name in recordsToAnalyze:
                outfile = os.path.join(target.getGlobalTempDir(),
                                       "complement_" + name)
                files["complement"].append(outfile)
                ref_name, ref_start, ref_stop = recordsToAnalyze[name]
                ref_seq = references[ref_name][ref_start:ref_stop]
                analysis = [name, seq, ref_name, ref_seq, outfile]
                target.addChildTarget(
                    Target.makeTargetFn(analyze, args=analysis))

    target.setFollowOnTargetFn(merge, args=(files, outputDir))
Example 5
 def issueJobs(self, jobCommands):
     """Issues parasol with job commands.
     """
     issuedJobs = {}
     for jobCommand, memory, cpu, logFile in jobCommands:
         assert memory != None
         assert cpu != None
         assert logFile != None
         pattern = re.compile("your job ([0-9]+).*")
         command = "parasol -verbose -ram=%i -cpu=%i -results=%s add job '%s'" % (memory, cpu, self.parasolResultsFile, jobCommand)
         while True:
             #time.sleep(0.1) #Sleep to let parasol catch up #Apparently unnecessary
             popenParasolCommand(command, self.scratchFile)
             fileHandle = open(self.scratchFile, 'r')
             line = fileHandle.readline()
             fileHandle.close()
             match = pattern.match(line)
             if match != None: #This is because parasol add job will return success, even if the job was not properly issued!
                 break
             else:
                 logger.info("We failed to properly add the job, we will try again after a sleep")
                 time.sleep(5)
         jobID = int(match.group(1))
         logger.debug("Got the job id: %s from line: %s" % (jobID, line))
         assert jobID not in issuedJobs.keys()
         issuedJobs[jobID] = jobCommand
         logger.debug("Issued the job command: %s with job id: %i " % (command, jobID))
     return issuedJobs
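
The loop above only trusts parasol's stdout: "parasol add job" can exit successfully without actually issuing the job, so the job id is scraped from the output line instead. A standalone sketch of that extraction using the same regex; the sample line is invented for illustration.

import re

pattern = re.compile("your job ([0-9]+).*")
#A line of the shape "parasol ... add job" is expected to print on success;
#the wording after the id is an assumption.
line = "your job 1234 has been added"
match = pattern.match(line)
if match is not None:
    jobID = int(match.group(1))  #1234
else:
    jobID = None  #success was reported, but no job id was printed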
Example 6
 def run(self):
     ##########################################
     #Setup a file tree.
     ##########################################
         
     tempFileTree = TempFileTree(os.path.join(self.getGlobalTempDir(), getRandomAlphaNumericString()))   
     
     fileTreeRootFile = tempFileTree.getTempFile()
 
     makeFileTree(fileTreeRootFile, \
                  self.depth, tempFileTree)
     
     treePointer = tempFileTree.getTempFile()
     
     makeTreePointer(fileTreeRootFile, treePointer)
     
     logger.info("We've set up the file tree")
     
     ##########################################
     #Issue the child and follow on jobs
     ##########################################
     
     self.addChildTarget(ChildTarget(treePointer))
     
     self.setFollowOnTarget(DestructFileTree(tempFileTree))
     
     logger.info("We've added the child target and finished SetupFileTree.run()")
def get_paired_fastqs(target, genome, institute, tissue, bam, reference, out_dir, name_sorted_sam_path, fwd_fastq_path,
                      rev_fastq_path):
    logger.info("Extracting paired fastqs")
    target.addChildTargetFn(get_fwd, args=(name_sorted_sam_path, fwd_fastq_path))
    target.addChildTargetFn(get_rev, args=(name_sorted_sam_path, rev_fastq_path))
    target.setFollowOnTargetFn(kallisto_paired, args=(genome, institute, tissue, bam, reference, out_dir, 
                                                      name_sorted_sam_path, fwd_fastq_path, rev_fastq_path))
Example 8
def parseJobFile(absFileName):
    try:
        job = ET.parse(absFileName).getroot()
        return job
    except IOError:
        logger.info("Encountered error while parsing job file %s, so we will ignore it" % absFileName)
    return None
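
A hedged usage sketch: write a minimal job element to disk, then read it back through parseJobFile. The attribute names follow restartFailedJobs (Example 19); the real job schema is richer, so treat this as illustrative only.

import xml.etree.ElementTree as ET

jobFileName = "/tmp/job0.xml"  #hypothetical path
job = ET.Element("job", {"file": jobFileName, "colour": "white",
                         "remaining_retry_count": "3"})
ET.ElementTree(job).write(jobFileName)

parsed = parseJobFile(jobFileName)
if parsed is not None:
    assert parsed.attrib["colour"] == "white"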
Example 9
def loadEnvironment(config):
    """Puts the environment in the pickle file.
    """
    #Dump out the environment of this process in the environment pickle file.
    fileHandle = open(config.attrib["environment_file"], 'w')
    cPickle.dump(os.environ, fileHandle)
    fileHandle.close()
    logger.info("Written the environment for the jobs to the environment file")
Example 10
def setupTempFileTrees(config):
    """Load the temp file trees
    """
    config.attrib["job_file_dir"] = TempFileTree(config.attrib["job_file_dir"])
    config.attrib["temp_dir_dir"] = TempFileTree(config.attrib["temp_dir_dir"])
    config.attrib["log_file_dir"] = TempFileTree(config.attrib["log_file_dir"])
    config.attrib["slave_log_file_dir"] = TempFileTree(config.attrib["slave_log_file_dir"])
    logger.info("Setup the temp file trees")
Example 11
 def run (self):
     parasolRestart()
     while True:
         time.sleep(random.choice(xrange(240)))
         if self.kill == True:
             return
         logger.info("Going to kill a parasol/master process")
         killMasterAndParasol()
Example 12
 def emHasTerminated(self):
     if self.iteration < 2:
         return False
     prevLL = self.readLL("params%i.txt" % (self.iteration - 1))
     currLL = self.readLL("params%i.txt" % (self.iteration))
     decrease = ((prevLL - currLL) / currLL)
     logger.info("LL: %5g, Decrease: %3g" % (currLL, 100 * decrease))
     return decrease < self.tolerance
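
A worked illustration of the stopping rule: iteration ends once the relative improvement in log-likelihood falls below the tolerance. The values below are invented; with negative log-likelihoods the quotient comes out positive whenever the likelihood improves.

prevLL, currLL = -10500.0, -10490.0    #invented log-likelihoods
decrease = (prevLL - currLL) / currLL  #-10.0 / -10490.0, about 0.000953
tolerance = 0.001
print decrease < tolerance             #True, so EM terminates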
Example 13
 def emHasTerminated(self):
     if self.iteration < 2:
         return False
     prevLL = self.readLL("params%i.txt" % (self.iteration - 1))
     currLL = self.readLL("params%i.txt" % (self.iteration))
     decrease = ((prevLL - currLL) / currLL)
     logger.info("LL: %5g, Decrease: %3g" % (currLL, 100*decrease))
     return decrease < self.tolerance
Example 14
def buildAnalyses(target, queries, baseOutDir, bpPenalty, dataPenalty,
                  tightness, keyFile, graph, kmerSize, saveInter):
    logger.info("Starting to build analyses")
    for uuid, queryString in queries.iteritems():
        target.addChildTarget(
            SlicerModelWrapper(uuid, queryString, baseOutDir, bpPenalty,
                               dataPenalty, tightness, keyFile, graph,
                               kmerSize, saveInter))
Example 15
    def run(self):
        logger.info("Progressive Next: " + self.event)

        if not self.schedule.isVirtual(self.event):
            self.addChildTarget(ProgressiveUp(self.options, self.project, self.event))
        followOnEvent = self.schedule.followOn(self.event)
        if followOnEvent is not None:
            self.addChildTarget(ProgressiveDown(self.options, self.project, followOnEvent,
                                                self.schedule))
Example 16
def createFirstJob(command, config, memory=None, cpu=None, time=sys.maxint):
    """Adds the first job to to the jobtree.
    """
    logger.info("Adding the first job")
    if memory == None:
        memory = config.attrib["default_memory"]
    if cpu == None:
        cpu = config.attrib["default_cpu"]
    job = createJob({ "command":command, "memory":str(int(memory)), "cpu":str(int(cpu)), "time":str(float(time)) }, None, config)
    writeJobs([job])
    logger.info("Added the first job")
Example 17
 def run(self):
     logger.info("Progressive Down: " + self.event)
     
     if not self.options.nonRecursive:
         deps = self.schedule.deps(self.event)
         for child in deps:
             self.addChildTarget(ProgressiveDown(self.options,
                                                 self.project, child, 
                                                 self.schedule))
     
     self.setFollowOnTarget(ProgressiveNext(self.options, self.project, self.event,
                                            self.schedule))
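
The recursion above leans on the usual jobTree ordering guarantee: every child target issued from run() finishes before the follow-on starts, so the follow-on ProgressiveNext can assume the whole subtree below the event is done. A generic sketch of that idiom; the Target import path and the module-level logger are assumptions.

from jobTree.scriptTree.target import Target  #import path is an assumption

def doChild(target, n):
    logger.info("child %i running" % n)

def doCleanup(target):
    logger.info("all children have finished")

class Parent(Target):
    def run(self):
        #The children may run in parallel, but both complete before the follow-on.
        self.addChildTarget(Target.makeTargetFn(doChild, args=(1,)))
        self.addChildTarget(Target.makeTargetFn(doChild, args=(2,)))
        self.setFollowOnTargetFn(doCleanup, args=())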
Example 18
 def killJobs(self, jobIDs):
     """Kills the given jobs, represented as Job ids, then checks they are dead by checking
     they are not in the list of issued jobs.
     """
     while True:
         for jobID in jobIDs:
             i = popenParasolCommand("parasol remove job %i" % jobID, tmpFileForStdOut=self.scratchFile, runUntilSuccessful=None)
             logger.info("Tried to remove jobID: %i, with exit value: %i" % (jobID, i))
         runningJobs = self.getIssuedJobIDs()
         if set(jobIDs).difference(set(runningJobs)) == set(jobIDs):
             return
         time.sleep(5)
         logger.critical("Tried to kill some jobs, but something happened and they are still going, so I'll try again")
Example 19
def restartFailedJobs(config, jobFiles):
    """Traverses through the file tree and resets the restart count of all jobs.
    """
    for absFileName in jobFiles:
        if os.path.isfile(absFileName):
            job = ET.parse(absFileName).getroot()
            logger.info("Restarting job: %s" % job.attrib["file"])
            job.attrib["remaining_retry_count"] = config.attrib["retry_count"]
            if job.attrib["colour"] == "red":
                job.attrib["colour"] = "white"
            #Is a leaf and the job failed when the system went down, but the status did not get updated.
            if job.attrib["colour"] == "grey": 
                job.attrib["colour"] = "white"
            writeJobs([ job ])
Example 20
def checkFileTreeCounts(rootFile):
    """Check the file tree produced by the test.
    """
    tree = ET.parse(rootFile).getroot()
    i = 0
    children = tree.find("children").findall("child")
    if len(children) == 0:
        i = 1
    else:
        for child in children:
            i += checkFileTreeCounts(child.attrib["file"])
    logger.info("File tree counts: %i %i" % (i, int(tree.attrib["count"])))
    assert i == int(tree.attrib["count"])
    return i
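
A sketch of the XML shape the walker expects, inferred from the function itself: every node carries a count attribute and a children element; a node with no child entries counts as 1, and an internal node's count must equal the sum over the files its child entries point at. The element tag and paths here are invented.

import xml.etree.ElementTree as ET

#Leaf: a <children> element with no <child> entries, so it counts as 1.
leaf = ET.Element("tree", {"count": "1"})
ET.SubElement(leaf, "children")
ET.ElementTree(leaf).write("/tmp/leaf.xml")

#Internal node: a single child entry pointing at the leaf, so its count is also 1.
root = ET.Element("tree", {"count": "1"})
children = ET.SubElement(root, "children")
ET.SubElement(children, "child", {"file": "/tmp/leaf.xml"})
ET.ElementTree(root).write("/tmp/root.xml")

assert checkFileTreeCounts("/tmp/root.xml") == 1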
Example 21
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]", version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" % (len(args), " ".join(args)))
    workingDir = args[0]
    
    #Assign the input files
    readFastqFiles = [ os.path.join(workingDir, "readFastqFiles", i) for i in os.listdir(os.path.join(workingDir, "readFastqFiles")) if ".fq" in i or ".fastq" in i ]
    referenceFastaFiles = [ os.path.join(workingDir, "referenceFastaFiles", i) for i in os.listdir(os.path.join(workingDir, "referenceFastaFiles")) if ".fa" in i or ".fasta" in i ] 
    outputDir = os.path.join(workingDir, "output")
    
    #Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readFastqFile in readFastqFiles:
        logger.info("Got the following read fastq file: %s" % readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" % referenceFastaFile)
    
    #This line invokes jobTree  
    i = Stack(Target.makeTargetFn(setupExperiments, args=(readFastqFiles, referenceFastaFiles, mappers, analyses, outputDir))).startJobTree(options) 
    
    if i != 0:
        raise RuntimeError("Got failed jobs")
Example 22
def runJobTree(command, jobTreeDir, logLevel="DEBUG", retryCount=0, batchSystem="single_machine", 
               rescueJobFrequency=None):
    """A convenience function for running job tree from within a python script.
    """
    if rescueJobFrequency != None:
        rescueJobFrequencyString = "--rescueJobsFrequency %s" % float(rescueJobFrequency)
    else:
        rescueJobFrequencyString = ""
    command = "jobTree --command \"%s\" --jobTree %s --logLevel %s \
--retryCount %i --batchSystem %s %s" % \
            (command, jobTreeDir,  logLevel, retryCount, batchSystem, rescueJobFrequencyString)
    logger.info("Running command : %s" % command)
    system(command)
    logger.info("Ran the jobtree apparently okay")
Example 23
def reloadJobTree(jobTree):
    """Load the job tree from a dir.
    """
    logger.info("The job tree appears to already exist, so we'll reload it")
    assert os.path.isfile(os.path.join(jobTree, "config.xml")) #A valid job tree must contain the config file
    assert os.path.isfile(os.path.join(jobTree, "environ.pickle")) #A valid job tree must contain a pickle file which encodes the path environment of the job
    assert os.path.isfile(os.path.join(jobTree, "jobNumber.xml")) #A valid job tree must contain a file which is updated with the number of jobs that have been run.
    assert os.path.isdir(os.path.join(jobTree, "jobs")) #A job tree must have a directory of jobs.
    assert os.path.isdir(os.path.join(jobTree, "tempDirDir")) #A job tree must have a directory of temporary directories (for jobs to make temp files in).
    assert os.path.isdir(os.path.join(jobTree, "logFileDir")) #A job tree must have a directory of log files.
    assert os.path.isdir(os.path.join(jobTree, "slaveLogFileDir")) #A job tree must have a directory of slave log files.
    config = ET.parse(os.path.join(jobTree, "config.xml")).getroot()
    setupTempFileTrees(config)
    batchSystem = loadTheBatchSystem(config)
    logger.info("Reloaded the jobtree")
    return config, batchSystem
Example 24
def killMasterAndParasol():
    """Method to destroy master process
    """
    tempFile = getTempFile()
    popen("ps -a", tempFile)
    fileHandle = open(tempFile, 'r')
    line = fileHandle.readline()
    #Example parasol state lines:
    #67401 ttys002    0:00.06 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    #67403 ttys002    0:00.65 /Users/benedictpaten/kent/src/parasol/bin/paraHub -log=/tmp/hub.2009-07-08.log machineList subnet=127.0.0
    #68573 ttys002    0:00.00 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    while line != '':
        tokens = line.split()
        if 'paraNode' in line or 'paraHub' in line:
            if random.random() > 0.5:
                i = os.system("kill %i" % int(tokens[0]))
                logger.info("Tried to kill parasol process: %i, line: %s, exit value: %i" % (int(tokens[0]), line, i))
                break
        elif 'jobTreeMaster.py' in line:
            logger.info("Have job tree master line")
            if random.random() > 0.5:
                i = os.system("kill %i" % int(tokens[0]))
                logger.info("Tried to kill master process: %i, line: %s, exit value: %i" % (int(tokens[0]), line, i))
                break
        line = fileHandle.readline()
    fileHandle.close()
    os.remove(tempFile)
    parasolRestart()
Example 25
 def testJobTree_Parasol(self):
     """Runs a test program using the job tree, whilst constantly restarting parasol
     by killing the nodes.
     """
     for test in xrange(self.testNo): #Does not run this test when doing short testing
         jobTreeCommand, fileTreeRootFile = setupJobTree(self.tempFileTree, self.jobTreeDir, 
                                                         "parasol", depth=self.depth)
         jobTreeCommand += " --rescueJobsFrequency 20"
         #Run the job
         parasolAndMasterKiller = ParasolAndMasterKiller()
         parasolAndMasterKiller.start()
         while True:
             while True:
                 process = subprocess.Popen(jobTreeCommand, shell=True)
                 sts = os.waitpid(process.pid, 0)
                 if sts[1] == 0:
                     logger.info("The job tree master ended, with an okay exit value (using parasol)")
                     break
                 else:
                     logger.info("The job tree master ended with an error exit value, restarting: %i" % sts[1])
             if checkEndStateOfJobTree(self.jobTreeDir): #Check the state of the job files
                 break
             
             jobTreeCommand = "jobTreeRun --jobTree %s --logDebug" % self.jobTreeDir
         checkFileTreeCounts(fileTreeRootFile)
         os.system("rm -rf %s" % self.jobTreeDir)
         parasolAndMasterKiller.stopKilling()
         logger.info("Test done okay")
Example 26
def setupJobTree(tempFileTree, jobTreeDir, batchSystem, depth=2):
    """Sets up a job tree using the jobTreeSetup.py command.
    """
    #Setup a job
    retryCount = random.choice(xrange(1,10))
    
    logger.info("Setup the basic files for the test")
    
    fileTreeRootFile = tempFileTree.getTempFile()
    makeFileTree(fileTreeRootFile, depth, tempFileTree)
    treePointerFile = makeTreePointer(fileTreeRootFile, tempFileTree.getTempFile())
    
    #Setup the job
    command = "jobTreeTest_CommandFirst.py --treePointer %s --job JOB_FILE" % \
    (treePointerFile)
    
    jobTreeCommand = "jobTreeRun --jobTree %s --retryCount %i\
     --command '%s' --logLevel=INFO --maxJobDuration 100 --batchSystem %s" % \
    (jobTreeDir, retryCount, command, batchSystem)
        
    logger.info("Setup the job okay")
    return (jobTreeCommand, fileTreeRootFile)
Example 27
def testJobTree(testNo, depth, tempFileTree, jobTreeDir, batchSystem):
    """Runs a test program using the job tree using the single machine batch system.
    """
    for test in xrange(testNo):
        jobTreeCommand, fileTreeRootFile = setupJobTree(tempFileTree, jobTreeDir, 
                                                        batchSystem, depth=depth)
        #Run the job
        while True:
            print "job tree command", jobTreeCommand
            
            process = subprocess.Popen(jobTreeCommand, shell=True)
            sts = os.waitpid(process.pid, 0)
            assert sts[1] == 0
            logger.info("The job tree master ended, with an okay exit value")
        
            if checkEndStateOfJobTree(jobTreeDir): #Check the state of the job files, exit if none
                break
            
            jobTreeCommand = "jobTreeRun --jobTree %s --logInfo" % jobTreeDir
            
        checkFileTreeCounts(fileTreeRootFile)
        os.system("rm -rf %s" % jobTreeDir)
        logger.info("Test done okay")
Example 28
def createJobTree(options):
    logger.info("Starting to create the job tree setup for the first time")
    options.jobTree = os.path.abspath(options.jobTree)
    os.mkdir(options.jobTree)
    config = ET.Element("config")
    config.attrib["environment_file"] = os.path.join(options.jobTree, "environ.pickle")
    config.attrib["job_number_file"] = os.path.join(options.jobTree, "jobNumber.xml")
    config.attrib["job_file_dir"] = os.path.join(options.jobTree, "jobs")
    config.attrib["temp_dir_dir"] = os.path.join(options.jobTree, "tempDirDir")
    config.attrib["log_file_dir"] = os.path.join(options.jobTree, "logFileDir")
    config.attrib["slave_log_file_dir"] = os.path.join(options.jobTree, "slaveLogFileDir")
    config.attrib["results_file"] = os.path.join(options.jobTree, "results.txt")
    config.attrib["scratch_file"] = os.path.join(options.jobTree, "scratch.txt")
    config.attrib["retry_count"] = str(int(options.retryCount))
    config.attrib["max_job_duration"] = str(float(options.maxJobDuration))
    config.attrib["batch_system"] = options.batchSystem
    config.attrib["job_time"] = str(float(options.jobTime))
    config.attrib["max_log_file_size"] = str(int(options.maxLogFileSize))
    config.attrib["default_memory"] = str(int(options.defaultMemory))
    config.attrib["default_cpu"] = str(int(options.defaultCpu))
    config.attrib["max_jobs"] = str(int(options.maxJobs))
    config.attrib["max_threads"] = str(int(options.maxThreads))
    if options.stats:
        config.attrib["stats"] = os.path.join(options.jobTree, "stats.xml")
        fileHandle = open(config.attrib["stats"], 'w')
        fileHandle.write("<stats>")
        fileHandle.close()
    #Load the batch system.
    batchSystem = loadTheBatchSystem(config)
    
    #Set the two parameters determining the polling frequency of the system.
    config.attrib["wait_duration"] = str(float(batchSystem.getWaitDuration()))
    if options.waitDuration != None:
        config.attrib["wait_duration"] = str(float(options.waitDuration))
        
    config.attrib["rescue_jobs_frequency"] = str(float(batchSystem.getRescueJobFrequency()))
    if options.rescueJobsFrequency != None:
        config.attrib["rescue_jobs_frequency"] = str(float(options.rescueJobsFrequency))
    
    #Write the config file to disk
    fileHandle = open(os.path.join(options.jobTree, "config.xml"), 'w')
    
    tree = ET.ElementTree(config)
    tree.write(fileHandle)
    fileHandle.close()
    logger.info("Written the config file")
    
    #Set up the jobNumber file
    fileHandle = open(config.attrib["job_number_file"], 'w')
    ET.ElementTree(ET.Element("job_number", { "job_number":'0' })).write(fileHandle)
    fileHandle.close()
    
    #Setup the temp file trees.
    setupTempFileTrees(config)
    
    logger.info("Finished the job tree setup")
    return config, batchSystem
Example 29
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)
    
    outputDir = "muscle_compare_2d/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError("Error: expected three arguments got %s arguments: %s" % (len(args), " ".join(args)))

    templateRecords = {x.qname for x in pysam.Samfile(args[0]) if not x.is_unmapped}
    complementRecords = {x.qname for x in pysam.Samfile(args[1]) if not x.is_unmapped}
    
    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname : x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]
    if os.path.exists("../readFastqFiles/template/") and os.path.exists("../readFastqFiles/complement"):
        templateFastqFiles = [os.path.join("../readFastqFiles/template/", x) for x in os.listdir("../readFastqFiles/template/") if x.endswith(".fastq") or x.endswith(".fq")]
        complementFastqFiles = [os.path.join("../readFastqFiles/complement/", x) for x in os.listdir("../readFastqFiles/complement/") if x.endswith(".fastq") or x.endswith(".fq")]
    else:
        raise RuntimeError("Error: readFastqFiles does not contain template and/or complement folders")

    referenceFastaFiles = [os.path.join("../referenceFastaFiles", x) for x in os.listdir("../referenceFastaFiles") if x.endswith(".fa") or x.endswith(".fasta")]
    
    if len(referenceFastaFiles) > 0:
        references = { y[0].split(" ")[0] : y[1] for x in referenceFastaFiles for y in fastaRead(x) }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError("Error: none of the mappable twoD reads in this set did not map as template/complement.")

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles, references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses, args=args)).startJobTree(options) 

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
Example 30
def main():
    ## Make sure we're in the right type of directory
    assert os.path.exists("jobs.list")
    assert os.path.exists("jobsEM.list")
    assert os.path.exists("config.txt")
    assert os.path.exists("configEM.txt")
    assert os.path.exists("params0.txt")

    assert commandAvailable(collectParamsExec)
    assert commandAvailable(mergeSwarm)
    assert commandAvailable(mergeMerge)

    ##
    ## Parse options
    ##
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)  # so that the stack will work
    parser.add_option("--jobFile",
                      help="Add as a child of jobFile rather " +
                      "than making a new jobTree")
    options, args = parser.parse_args()
    print "Using Batch System '" + options.batchSystem + "'"
    assert len(args) == 0 or len(args) == 1

    tolerance = 0.001
    if len(args) == 1:
        tolerance = float(args[0])

    logger.info("options: " + str(options))

    ##
    ## Run
    ##
    logger.info("starting first EM iteration")
    s = Stack(ExpectationIteration(0, tolerance, os.getcwd()))
    if options.jobFile:
        s.addToJobFile(options.jobFile)
    else:
        if options.jobTree == None:
            options.jobTree = "./.jobTree"

        failed = s.startJobTree(options)
        if failed:
            print("%d jobs failed" % failed)
        else:
            logger.info("Run complete!")
Example 31
def main():
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    
    parser.add_option("--jobTree", dest="jobTree", 
                      help="Directory containing the job tree to kill")
    
    options, args = parseBasicOptions(parser)
    logger.info("Parsed arguments")
    assert len(args) == 0 #This program takes no arguments
    assert options.jobTree != None #The jobtree should not be null
    assert os.path.isdir(options.jobTree) #The job tree must exist if we are going to kill it.
    logger.info("Starting routine to kill running jobs in the jobTree: %s" % options.jobTree)
    config = ET.parse(os.path.join(options.jobTree, "config.xml")).getroot()
    batchSystem = loadTheBatchSystem(config) #This should automatically kill the existing jobs.. so we're good.
    for job in batchSystem.getIssuedJobIDs(): #Just in case we do it again.
        batchSystem.killJobs(job)
    logger.info("All jobs SHOULD have been killed")
Example 32
def main():
    ## Make sure we're in the right type of directory
    assert os.path.exists("jobs.list")
    assert os.path.exists("jobsEM.list")
    assert os.path.exists("config.txt")
    assert os.path.exists("configEM.txt")
    assert os.path.exists("params0.txt")

    assert commandAvailable(collectParamsExec)
    assert commandAvailable(mergeSwarm)
    assert commandAvailable(mergeMerge)

    ##
    ## Parse options
    ##
    parser = OptionParser()
    Stack.addJobTreeOptions(parser) # so that the stack will work
    parser.add_option("--jobFile", help="Add as a child of jobFile rather " +
                      "than making a new jobTree")
    options, args = parser.parse_args()
    print "Using Batch System '" + options.batchSystem + "'"
    assert len(args) == 0 or len(args) == 1

    tolerance = 0.001
    if len(args) == 1:
        tolerance = float(args[0])

    logger.info("options: " + str(options))

    ##
    ## Run
    ##
    logger.info("starting first EM iteration")
    s = Stack(ExpectationIteration(0, tolerance, os.getcwd()))
    if options.jobFile:
        s.addToJobFile(options.jobFile)
    else:
        if options.jobTree == None:
            options.jobTree = "./.jobTree"
        
        failed = s.startJobTree(options)
        if failed:
            print ("%d jobs failed" % failed)
        else:
            logger.info("Run complete!")
Example 33
def loadTheBatchSystem(config):
    """Load the batch system.
    """
    batchSystemString = config.attrib["batch_system"]
    if batchSystemString == "parasol":
        batchSystem = ParasolBatchSystem(config)
        logger.info("Using the parasol batch system")
    elif batchSystemString == "single_machine" or batchSystemString == "singleMachine":
        batchSystem = SingleMachineBatchSystem(config)
        logger.info("Using the single machine batch system")
    elif batchSystemString == "gridengine" or batchSystemString == "gridEngine":
        batchSystem = GridengineBatchSystem(config)
        logger.info("Using the grid engine machine batch system")
    elif batchSystemString == "acid_test" or batchSystemString == "acidTest":
        batchSystem = SingleMachineBatchSystem(config, workerClass=BadWorker)
        config.attrib["retry_count"] = str(32) #The chance that a job does not complete after 32 goes in one in 4 billion, so you need a lot of jobs before this becomes probable
    else:
        raise RuntimeError("Unrecognised batch system: %s" % batchSystemString)
    return batchSystem
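
A minimal sketch of driving the dispatch above with a hand-built config element. The attribute names follow createJobTree (Example 28); a real batch system will need more attributes than shown, so this is illustrative only.

import xml.etree.ElementTree as ET

config = ET.Element("config")
config.attrib["batch_system"] = "single_machine"
config.attrib["retry_count"] = "0"
config.attrib["max_threads"] = "4"  #assumed to be required by the backend
batchSystem = loadTheBatchSystem(config)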
Example 34
 def __init__(self, config):
     AbstractBatchSystem.__init__(self, config) #Call the parent constructor
      #Keep the name of the results file for the pstat2 command.
     self.parasolResultsFile = config.attrib["results_file"]
     #Reset the job queue and results (initially, we do this again once we've killed the jobs)
     self.parasolResultsFileHandle = open(self.parasolResultsFile, 'w')
      self.parasolResultsFileHandle.close() #We lose any previous state in this file, and ensure the file's existence
     self.queuePattern = re.compile("q\s+([0-9]+)")
     self.runningPattern = re.compile("r\s+([0-9]+)\s+[\S]+\s+[\S]+\s+([0-9]+)\s+[\S]+")
     #The scratch file
     self.scratchFile = self.config.attrib["scratch_file"]
     self.killJobs(self.getIssuedJobIDs()) #Kill any jobs on the current stack
     logger.info("Going to sleep for a few seconds to kill any existing jobs")
     time.sleep(5) #Give batch system a second to sort itself out.
     logger.info("Removed any old jobs from the queue")
     #Reset the job queue and results
     self.parasolResultsFileHandle = open(self.parasolResultsFile, 'w')
      self.parasolResultsFileHandle.close() #We lose any previous state in this file, and ensure the file's existence
     self.parasolResultsFileHandle = open(self.parasolResultsFile, 'r')
     logger.info("Reset the results queue")
Example 35
def main():
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: workingDir [options]",
                          version="%prog 0.1")
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        raise RuntimeError("Expected one argument, got %s arguments: %s" %
                           (len(args), " ".join(args)))
    workingDir = args[0]

    # call read sampler script; samples 75, 50, and 25% reads
    #SampleReads(workingDir)

    #Create (if necessary) the output dir
    outputDir = os.path.join(workingDir, "output")
    if not os.path.exists(outputDir):
        logger.info("Creating output dir: %s" % outputDir)
        os.mkdir(outputDir)
    else:
        logger.info("Root output dir already exists: %s" % outputDir)

    #Assign/process (uniquify the names of) the input read fastq files
    processedFastqFiles = os.path.join(outputDir, "processedReadFastqFiles")
    if not os.path.exists(processedFastqFiles):
        os.mkdir(processedFastqFiles)

    fastqParentDir = os.path.join(workingDir, "readFastqFiles")
    readFastqFiles = list()
    for fastqSubDir in filter(
            os.path.isdir,
        [os.path.join(fastqParentDir, x) for x in os.listdir(fastqParentDir)]):
        readType = os.path.basename(fastqSubDir)
        if not os.path.exists(
                os.path.join(processedFastqFiles,
                             os.path.basename(fastqSubDir))):
            os.mkdir(os.path.join(processedFastqFiles, readType))
        readFastqFiles.append([
            readType,
            [
                makeFastqSequenceNamesUnique(
                    os.path.join(workingDir, "readFastqFiles", readType, i),
                    os.path.join(processedFastqFiles, readType, i))
                for i in os.listdir(
                    os.path.join(workingDir, "readFastqFiles", readType))
                if (".fq" in i and i[-3:] == '.fq') or (
                    ".fastq" in i and i[-6:] == '.fastq')
            ]
        ])

    #Assign/process (uniquify the names of) the input reference fasta files
    processedFastaFiles = os.path.join(outputDir,
                                       "processedReferenceFastaFiles")
    if not os.path.exists(processedFastaFiles):
        os.mkdir(processedFastaFiles)
    referenceFastaFiles = [
        makeFastaSequenceNamesUnique(
            os.path.join(workingDir, "referenceFastaFiles", i),
            os.path.join(processedFastaFiles, i))
        for i in os.listdir(os.path.join(workingDir, "referenceFastaFiles"))
        if (".fa" in i and i[-3:] == '.fa') or (
            ".fasta" in i and i[-6:] == '.fasta')
    ]

    # call reference mutator script; introduces 1%, and 5% mutations (No nucleotide bias used for now)
    #referenceFastaFiles = mutateReferenceSequences(referenceFastaFiles)

    #Log the inputs
    logger.info("Using the following working directory: %s" % workingDir)
    logger.info("Using the following output directory: %s" % outputDir)
    for readType, readTypeFastqFiles in readFastqFiles:
        logger.info("Got the follow read type: %s" % readType)
        for readFastqFile in readTypeFastqFiles:
            logger.info("Got the following read fastq file: %s" %
                        readFastqFile)
    for referenceFastaFile in referenceFastaFiles:
        logger.info("Got the following reference fasta files: %s" %
                    referenceFastaFile)

    #This line invokes jobTree
    i = Stack(
        Target.makeTargetFn(setupExperiments,
                            args=(readFastqFiles, referenceFastaFiles, mappers,
                                  analyses, metaAnalyses,
                                  outputDir))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got failed jobs")
Example 36
def jtFitness():
    ## parse arguments
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    parser.add_option("--jobFile", help = "Add as a child of jobFile rather " +
                      "than making a new jobTree")
    parser.add_option("-d", "--dogma", dest="dogmaPath", default="")
    parser.add_option("-p", "--pathway", dest="pathwayPath", default="")
    parser.add_option("-b", "--boundaries", dest="discBound", default="")
    parser.add_option("-s", "--shuffle", dest="shuffleNode", default="")
    parser.add_option("-n", "--nulls", dest="nNulls", default="10")
    parser.add_option("-t", "--storedparam", dest="paramFile", default="")
    
    options, args = parser.parse_args()
    print "Using Batch System '" + options.batchSystem + "'"
   
    evidList = args 
    if (len(evidList) % 2 == 1) | (len(evidList) == 0):
        sys.stderr.write("ERROR: incorrect number of arguments\n")
        sys.exit(1)
    
    if len(options.discBound) == 0:
        disc = "0.3333;0.6667"
    else:
        disc = options.discBound
    if len(options.dogmaPath) == 0:
        dogma = "%s/%s" % (dogmaDir, dogmaDefault)
    else:
        dogma = options.dogmaPath
        if not dogma.startswith("/"):
            dogma = "%s/%s" % (os.getcwd(), dogma)        
    if len(options.pathwayPath) == 0:
        pathway = "%s/%s" % (pathwayDir, pathwayDefault)
    else:
        pathway = options.pathwayPath
        if not pathway.startswith("/"):
            pathway = "%s/%s" % (os.getcwd(), pathway)
    if len(options.shuffleNode) == 0:
        shuffleNode = "NULL"
    else:
        shuffleNode = options.shuffleNode
    nShuffle = int(options.nNulls)
    if len(options.paramFile) == 0:
        paramFile = None
    else:
        paramFile = options.paramFile

    ## clean
    if len(args) == 1:
        if args[0] == "clean":
            print "rm -rf .jobTree fold*"
            os.system("rm -rf .jobTree fold*")
            sys.exit(0)
    
    ## run
    logger.info("options: " + str(options))
    s = Stack(branchFolds(" ".join(evidList), disc, paramFile, paradigmExec, inferSpec, dogma, pathway, shuffleNode, nShuffle, mFolds, os.getcwd()))
    if options.jobFile:
        s.addToJobFile(options.jobFile)
    else:
        if options.jobTree == None:
            options.jobTree = "./.jobTree"
        
        failed = s.startJobTree(options)
        if failed:
            print ("%d jobs failed" % failed)
        else:
            logger.info("Run complete!")
            system("rm -rf .lastjobTree")
            system("mv .jobTree .lastjobTree")
Example 37
def main():
    ## parse arguments
    parser = OptionParser(usage="%prog [options] network IPL-matrix features")
    Stack.addJobTreeOptions(parser)
    parser.add_option("--jobFile",
                      help="Add as a child of jobFile rather " +
                      "than making a new jobTree")
    parser.add_option("-w",
                      "--workdir",
                      dest="workdir",
                      help="Common Work directory",
                      default="./")
    parser.add_option("-i", "--ipl", dest="iplFile", default=None)
    parser.add_option("-p", "--pathway", dest="pathwayZip", default=None)
    parser.add_option("-c", "--phenotype", dest="phenotypeFile", default=None)
    parser.add_option("-o", "--oz", dest="outputZip", default=None)
    parser.add_option("-s", "--score", dest="scoreFile", default=None)
    parser.add_option("-f", "--filter", dest="filterParams", default="0.0;0.0")
    parser.add_option("-b", "--background", dest="nBackground", default="0")
    options, args = parser.parse_args()
    print "Using Batch System '" + options.batchSystem + "'"

    ## clean
    if len(args) == 1:
        if args[0] == "clean":
            print "rm -rf real* null* OCCAM__* LAYOUT background.R .jobTree"
            system("rm -rf real* null* OCCAM__* LAYOUT background.R .jobTree")
            sys.exit(0)

    ## parse arguments
    assert ((len(args) == 0) or (len(args) == 2) or (len(args) == 3))
    if len(args) == 0:
        pathwayZip = options.pathwayZip if options.pathwayZip is not None else basepathway
        pathwayLib = os.path.join(options.workdir, "pathway")
        system("unzip %s -d %s" % (pathwayZip, pathwayLib))
        paradigmPathway = None
        for file in os.listdir(pathwayLib):
            if file.endswith("_pathway.tab"):
                paradigmPathway = "%s/%s" % (pathwayLib, file)
                break
        scoreFile = None
        phenotypeFile = options.phenotypeFile
        dataFile = options.iplFile
        sampleList = []
        for sample in retColumns(dataFile):
            if not sample.startswith("na_iter"):
                sampleList.append(sample)
        filterParams = options.filterParams
        nNulls = int(options.nBackground)
        outputZip = options.outputZip
        assert (os.path.exists(paradigmPathway))
        assert (os.path.exists(phenotypeFile))
        assert (os.path.exists(dataFile))
    elif len(args) == 2:
        paradigmPathway = args[0]
        scoreFile = args[1]
        phenotypeFile = None
        dataFile = None
        sampleList = None
        filterParams = options.filterParams
        nNulls = 0
        outputZip = options.outputZip
        assert (os.path.exists(paradigmPathway))
        assert (os.path.exists(scoreFile))
    elif len(args) == 3:
        paradigmPathway = args[0]
        scoreFile = None
        phenotypeFile = args[2]
        dataFile = args[1]
        sampleList = []
        for sample in retColumns(dataFile):
            if not sample.startswith("na_iter"):
                sampleList.append(sample)
        filterParams = options.filterParams
        nNulls = int(options.nBackground)
        outputZip = options.outputZip
        assert (os.path.exists(paradigmPathway))
        assert (os.path.exists(phenotypeFile))
        assert (os.path.exists(dataFile))

    ## run
    logger.info("options: " + str(options))
    logger.info("starting make")
    writeScripts()
    s = Stack(
        prepareOCCAM(paradigmPathway, scoreFile, phenotypeFile, None, dataFile,
                     sampleList, filterParams, nNulls, outputZip, os.getcwd()))
    if options.jobFile:
        s.addToJobFile(options.jobFile)
    else:
        if options.jobTree == None:
            options.jobTree = "./.jobTree"

        failed = s.startJobTree(options)
        if failed:
            print("%d jobs failed" % failed)
        else:
            logger.info("Run complete!")
            system("rm -rf .lastjobTree")
            system("mv .jobTree .lastjobTree")
Example 38
def wrapParadigm():
    ## parse arguments
    parser = OptionParser(usage = "%prog [options] attachment file:path [attachment file:path ...]")
    Stack.addJobTreeOptions(parser)
    parser.add_option("--jobFile", help = "Add as a child of jobFile rather " +
                      "than making a new jobTree")
    parser.add_option("-w", "--workdir", dest="workdir", help="Common Work directory", default="./")
    parser.add_option("-n", "--nulls", dest="nulls", help="Number of Null Samples", default="5")
    parser.add_option("-d", "--dogma", dest="dogmazip", help="Path to PARADIGM Dogma Specification", default=basedogma)
    parser.add_option("-p", "--pathway", dest="pathwayzip", help="Path to PARADIGM Pathway Specification", default=basepathway)
    parser.add_option("-b", "--boundaries", dest="disc", help="Data Discretization Bounds", default="0.33;0.67")
    parser.add_option("-t", "--storedparam", dest="param", help="Initial Parameter Starting Point", default=None)
    parser.add_option("-s", "--skipem", action="store_false", dest="em", help="Skip Running EM", default=True)
    
    parser.add_option("--fr", "--filter-real", dest="filtered_real", help="Filtered Output", default=None)
    parser.add_option("--fa", "--filter-all", dest="filtered_all", help="Filtered Output", default=None)
    parser.add_option("--ur", "--unfilter-real", dest="unfiltered_real", help="Filtered Output", default=None)
    parser.add_option("--ua", "--unfilter-all", dest="unfiltered_all", help="Filtered Output", default=None)
    
    options, args = parser.parse_args()
    logger.info("options: " + str(options))
    print "Using Batch System '" + options.batchSystem + "'"
    
    evidList = []
    for i, element in enumerate(args):
        if i % 2 == 1:
            (fileType, filePath) = args[i].split(":")
            evidList.append("%s:%s" % (fileType, os.path.abspath(filePath)))
        else:
            evidList.append(args[i])
    
    if (len(evidList) % 2 == 1) | (len(evidList) == 0):
        sys.stderr.write("ERROR: incorrect number of arguments\n")
        sys.exit(1)
    
 
    workdir = os.path.abspath(options.workdir)
    nullBatches = int(options.nulls)
    dogmaZip=os.path.abspath(options.dogmazip)
    pathwayZip=os.path.abspath(options.pathwayzip)
    disc=options.disc
    paramFile=os.path.abspath(options.param) if options.param is not None else None
    runEM = options.em
    
    dogmaLib = os.path.join(workdir, "dogma")
    pathwayLib = os.path.join(workdir, "pathway")
    system("unzip %s -d %s" % (dogmaZip, dogmaLib))
    system("unzip %s -d %s" % (pathwayZip, pathwayLib))

    ## run
    logger.info("starting prepare")
    s = Stack(prepareParadigm(" ".join(evidList), disc, paramFile, nullBatches, paradigmExec, inferSpec, dogmaLib, pathwayLib, runEM, workdir))
    if options.jobFile:
        s.addToJobFile(options.jobFile)
    else:
        if options.jobTree == None:
            options.jobTree = "./.jobTree"
        
        failed = s.startJobTree(options)
        if failed:
            print ("%d jobs failed" % failed)
        else:
            logger.info("Run complete!")
            system("rm -rf .lastjobTree")
            system("mv .jobTree .lastjobTree")
Example 39
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "muscle_compare_2d/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    if len(args) != 3:
        raise RuntimeError(
            "Error: expected three arguments got %s arguments: %s" %
            (len(args), " ".join(args)))

    templateRecords = {
        x.qname
        for x in pysam.Samfile(args[0]) if not x.is_unmapped
    }
    complementRecords = {
        x.qname
        for x in pysam.Samfile(args[1]) if not x.is_unmapped
    }

    twodSamFile = pysam.Samfile(args[2])
    twodRecords = {x.qname: x for x in twodSamFile if not x.is_unmapped}

    recordsToAnalyze = dict()
    for name, record in twodRecords.iteritems():
        if name not in templateRecords and name not in complementRecords:
            ref_name = twodSamFile.getrname(record.tid)
            ref_start, ref_stop = int(record.aend - record.alen), int(
                record.aend)
            recordsToAnalyze[name] = [ref_name, ref_start, ref_stop]
    if os.path.exists("../readFastqFiles/template/") and os.path.exists(
            "../readFastqFiles/complement"):
        templateFastqFiles = [
            os.path.join("../readFastqFiles/template/", x)
            for x in os.listdir("../readFastqFiles/template/")
            if x.endswith(".fastq") or x.endswith(".fq")
        ]
        complementFastqFiles = [
            os.path.join("../readFastqFiles/complement/", x)
            for x in os.listdir("../readFastqFiles/complement/")
            if x.endswith(".fastq") or x.endswith(".fq")
        ]
    else:
        raise RuntimeError(
            "Error: readFastqFiles does not contain template and/or complement folders"
        )

    referenceFastaFiles = [
        os.path.join("../referenceFastaFiles", x)
        for x in os.listdir("../referenceFastaFiles")
        if x.endswith(".fa") or x.endswith(".fasta")
    ]

    if len(referenceFastaFiles) > 0:
        references = {
            y[0].split(" ")[0]: y[1]
            for x in referenceFastaFiles for y in fastaRead(x)
        }
    else:
        raise RuntimeError("Error: no reference fasta files")

    if len(recordsToAnalyze) == 0:
        raise RuntimeError(
            "Error: none of the mappable twoD reads in this set did not map as template/complement."
        )

    logger.info("Starting to find analyses to run...")
    args = (recordsToAnalyze, templateFastqFiles, complementFastqFiles,
            references, outputDir)
    i = Stack(Target.makeTargetFn(find_analyses,
                                  args=args)).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))
Example 40
def main():
    parser = OptionParser()
    Stack.addJobTreeOptions(parser)
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    outputDir = "blast_combined/output/"

    if not os.path.exists(outputDir):
        logger.info("Output dir {} does not exist. Creating.")
        os.mkdir(outputDir)
    if len(os.listdir(outputDir)) > 0:
        logger.info("Output dir not empty.")

    #find all read fastq files, load into a dict by read type
    readFastqFiles = dict()
    for readType in readTypes:
        readFastqFiles[readType] = [
            os.path.join("../output/processedReadFastqFiles/", readType, x)
            for x in os.listdir(
                os.path.join("../output/processedReadFastqFiles/", readType))
            if x.endswith(".fq") or x.endswith(".fastq")
        ]

    #find all reference fasta files
    referenceFastaFiles = [
        x for x in os.listdir("../referenceFastaFiles")
        if x.endswith(".fasta") or x.endswith(".fa")
    ]

    #find all sam files that were analyzed using combinedAnalyses
    samFiles = {}
    for readType in readTypes:
        samFiles[readType] = [
            (readFastqFile,
             os.path.join(
                 "../output", "analysis_" + readType,
                 "experiment_" + os.path.basename(readFastqFile) + "_" +
                 referenceFastaFile + "_" + analysis, "mapping.sam"))
            for readFastqFile, referenceFastaFile, analysis in product(
                readFastqFiles[readType], referenceFastaFiles,
                combinedAnalyses)
        ]

    mappedByReadType = defaultdict(set)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            mappedNames = {(x.qname, readFastqFile)
                           for x in pysam.Samfile(samFile)
                           if not x.is_unmapped}
            mappedByReadType[readType] = mappedByReadType[readType].union(
                mappedNames)

    unmappedByReadType = defaultdict(dict)
    for readType in readTypes:
        for readFastqFileFullPath, samFile in samFiles[readType]:
            readFastqFile = os.path.basename(readFastqFileFullPath)
            for name, seq, qual in fastqRead(readFastqFileFullPath):
                name = name.split(" ")[0]
                if (name, readFastqFile) not in mappedByReadType[readType]:
                    unmappedByReadType[readType][(name, readFastqFile)] = seq

    i = Stack(
        Target.makeTargetFn(find_analyses,
                            args=(unmappedByReadType,
                                  outputDir))).startJobTree(options)

    if i != 0:
        raise RuntimeError("Got {} failed jobs".format(i))

    for readType in readTypes:
        #build a counter of blast hits and set of read names that did not map
        blast_hits, no_hits = Counter(), set()
        for query, result in parse_blast(
                open(os.path.join(outputDir, readType + "_blast_out.txt"))):
            if result is None:
                no_hits.add(query)
            else:
                blast_hits[tuple(
                    result)] += 1  #count number of times each hit was seen
        #write the unmapped hits to a fasta file
        outf = open(os.path.join(outputDir, readType + "_no_hits.fasta"), "w")
        for (name,
             readFastqFile), seq in unmappedByReadType[readType].iteritems():
            if name in no_hits:
                outf.write(">{}\n{}\n".format(name, seq))
        outf.close()
        #write the blast report
        blast_out = open(
            os.path.join(outputDir, readType + "_blast_report.txt"), "w")
        blast_out.write(
            "gi|##|gb|##|\tSpecies\tseqID\tCount\n")  #header to output
        for result, count in sorted(blast_hits.items(),
                                    key=lambda x: -int(x[-1])):
            blast_out.write("{}\t{}\n".format("\t".join(result), count))
        blast_out.close()
        #calculate percents and make a barplot
        blast_count = sum(blast_hits.values())
        unmapped_count = len(unmappedByReadType[readType]) - sum(
            blast_hits.values())
        mapped_count = len(mappedByReadType[readType])

        #blast_percent = 1.0 * sum(blast_hits.values()) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #unmapped_percent = (1.0 * len(unmappedByReadType[readType]) - sum(blast_hits.values())) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        #mapped_percent = 1.0 * len(mappedByReadType[readType]) / (len(mappedByReadType[readType]) + len(unmappedByReadType[readType]))
        outf = open(os.path.join(outputDir, readType + "percents.txt"), "w")
        outf.write("\n".join(
            map(str, [blast_count, unmapped_count, mapped_count])))
        outf.close()
        #system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(blast_percent, unmapped_percent, mapped_percent, readType, os.path.join(outputDir, readType + "_blast_barplot.pdf")))
        system("Rscript blast_combined/barplot_blast.R {} {} {} {} {}".format(
            blast_count, unmapped_count, mapped_count, readType,
            os.path.join(outputDir, readType + "_blast_barplot.pdf")))
Example 41
def wrapParadigm():
    ## parse arguments
    parser = OptionParser(
        usage="%prog [options] attachment file:path [attachment file:path ...]"
    )
    Stack.addJobTreeOptions(parser)
    parser.add_option("--jobFile",
                      help="Add as a child of jobFile rather " +
                      "than making a new jobTree")
    parser.add_option("-d",
                      "--dogma",
                      dest="dogmaPath",
                      help="Path to PARADIGM Dogma Specification",
                      default="")
    parser.add_option("-p",
                      "--pathway",
                      dest="pathwayPath",
                      help="Path to PARADIGM Pathway Specification",
                      default="")
    parser.add_option("-b",
                      "--boundaries",
                      dest="discBound",
                      help="Data Discretization Bounds",
                      default="")
    parser.add_option("-n",
                      "--nulls",
                      dest="nullBatches",
                      help="Number of Null Samples",
                      default="5")
    parser.add_option("-t",
                      "--storedparam",
                      dest="paramFile",
                      help="Initial Parameter Starting Point",
                      default="")
    parser.add_option("-s",
                      "--skipem",
                      action="store_false",
                      dest="runEM",
                      help="Skip Running EM",
                      default=True)
    options, args = parser.parse_args()
    print "Using Batch System '" + options.batchSystem + "'"

    evidList = []
    for element in args:
        if element.startswith("rankAllFile"):
            evidList.append(re.sub("rankAllFile", "file", element))
        else:
            evidList.append(element)

    if (len(evidList) % 2 == 1) | (len(evidList) == 0):
        sys.stderr.write("ERROR: incorrect number of arguments\n")
        sys.exit(1)

    if len(options.discBound) == 0:
        disc = "0.3333;0.6667"
    else:
        disc = options.discBound
    if len(options.dogmaPath) == 0:
        dogma = "%s/%s" % (dogmaDir, dogmaDefault)
    else:
        dogma = options.dogmaPath
    if len(options.pathwayPath) == 0:
        pathway = "%s/%s" % (pathwayDir, pathwayDefault)
    else:
        pathway = options.pathwayPath
    nullBatches = int(options.nullBatches)
    if len(options.paramFile) == 0:
        paramFile = None
    else:
        paramFile = options.paramFile
    runEM = options.runEM
    logger.info("options: " + str(options))

    ## run
    logger.info("starting prepare")
    s = Stack(
        prepareParadigm(" ".join(evidList), disc, paramFile, nullBatches,
                        paradigmExec, inferSpec, dogma, pathway, runEM,
                        os.getcwd()))
    if options.jobFile:
        s.addToJobFile(options.jobFile)
    else:
        if options.jobTree == None:
            options.jobTree = "./.jobTree"

        failed = s.startJobTree(options)
        if failed:
            print("%d jobs failed" % failed)
        else:
            logger.info("Run complete!")
            system("rm -rf .lastjobTree")
            system("mv .jobTree .lastjobTree")