Beispiel #1
0
    def run(self, fileStore):
        """Queue the preprocessing of the input sequences and the progressive alignment that follows.

        The project config is read from the file store, assembly statistics are
        logged for every input sequence, a CactusPreprocessor child job is added,
        and a RunCactusPreprocessorThenProgressiveDown2 follow-on is scheduled.

        fileStore -- file store used to read the config file and to log to the master.

        Returns the promised return value (``rv()``) of the follow-on job.
        """
        # Parse the config and keep both the root node and its wrapper on this job
        localConfigPath = fileStore.readGlobalFile(self.project.getConfigID())
        self.configNode = ET.parse(localConfigPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # One stats-logging child job per assembly, before any preprocessing happens
        inputSequenceMap = self.project.getInputSequenceIDMap()
        for genomeName, sequenceID in inputSequenceMap.items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing", genomeName, sequenceID)

        # The preprocessor gets its own, separately parsed copy of the config tree
        logger.info("Reading config file from: %s" % self.project.getConfigID())
        preprocessorConfigPath = fileStore.readGlobalFile(self.project.getConfigID())
        preprocessorConfig = ET.parse(preprocessorConfigPath).getroot()
        # The constants must be substituted in this copy as well
        ConfigWrapper(preprocessorConfig).substituteAllPredefinedConstantsWithLiterals()

        # Add the preprocessor child job. Its return values are promises; one promise
        # per input sequence is recorded as the corresponding output sequence ID.
        preprocessorJob = self.addChild(CactusPreprocessor(self.project.getInputSequenceIDs(), preprocessorConfig))
        promisedOutputIDs = []
        for index in range(len(self.project.getInputSequenceIDs())):
            promisedOutputIDs.append(preprocessorJob.rv(index))
        self.project.setOutputSequenceIDs(promisedOutputIDs)

        # Build the schedule used by the progressive-down job
        schedule = Schedule()
        schedule.loadProject(self.project, fileStore=fileStore)
        schedule.compute()

        # The alignment is rooted at the root of the project tree
        self.options.event = self.project.mcTree.getRootName()
        leafNames = []
        for leaf in self.project.mcTree.getLeaves():
            leafNames.append(self.project.mcTree.getName(leaf))
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        followOnJob = RunCactusPreprocessorThenProgressiveDown2(
            options=self.options,
            project=self.project,
            event=self.options.event,
            schedule=schedule,
            memory=self.configWrapper.getDefaultMemory())
        return self.addFollowOn(followOnJob).rv()
Beispiel #2
0
def main():
    """Command-line entry point: set up a multi-cactus project from an experiment XML template.

    Expects exactly two positional arguments, the experiment file and the output
    project path. Raises RuntimeError when the argument count is wrong, when the
    output path already exists (a directory without --overwrite, or a file), when
    the species tree root has no children, or when a requested outgroup is not a
    leaf of the tree.

    Returns 0 on success.
    """
    parser = OptionParser(
        usage="usage: %prog [options] <experiment> <output project path>",
        description="Setup a multi-cactus project using an experiment xml as template")
    parser.add_option("--fixNames", dest="fixNames", default="True",
                      help="try to make sequence and event names MAF-compliant [default=true]")
    parser.add_option("--outgroupNames", dest="outgroupNames", default=None,
                      help="comma-separated names of high quality assemblies to use as outgroups [default=everything]")
    parser.add_option("--root", dest="root", type=str, default=None,
                      help="name of alignment root (must be labeled ancestral node in tree in input experiment).  Useful "
                      "for allowing the tree to contain nodes that won't be in the alignment but can still be used for "
                      "outgroups.")
    parser.add_option("--overwrite", action="store_true", default=False,
                      help="Overwrite existing experiment files")

    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")

    experimentFile, projectPath = args
    options.expFile = experimentFile
    options.path = os.path.abspath(projectPath)
    options.name = os.path.basename(options.path)
    # --fixNames arrives as a string; anything other than "false" (case-insensitive) enables it
    options.fixNames = options.fixNames.lower() != "false"

    # An existing directory is only acceptable with --overwrite; an existing file never is
    blockedByDirectory = os.path.isdir(options.path) and not options.overwrite
    if blockedByDirectory or os.path.isfile(options.path):
        raise RuntimeError("Output project path %s exists\n" % options.path)

    # Load the experiment template and the config file it points at
    expTemplate = ExperimentWrapper(ET.parse(options.expFile).getroot())
    confTemplate = ConfigWrapper(ET.parse(expTemplate.getConfigPath()).getroot())
    if options.fixNames:
        cleanEventTree(expTemplate)
    checkInputSequencePaths(expTemplate)
    tree = expTemplate.getTree()

    # Check that the tree is sensible (root has at least 1 child)
    rootChildren = tree.getChildren(tree.getRootId())
    if len(rootChildren) == 0:
        raise RuntimeError("Input species tree has only one node.")

    # Every requested outgroup must be the name of a leaf in the tree
    if options.outgroupNames is not None:
        leafNames = set(tree.getName(leaf) for leaf in tree.getLeaves())
        options.outgroupNames = set(options.outgroupNames.split(","))
        for requestedOutgroup in options.outgroupNames:
            if requestedOutgroup not in leafNames:
                raise RuntimeError("Specified outgroup %s not found in tree" % requestedOutgroup)

    mcProj = createMCProject(tree, expTemplate, confTemplate, options)

    # Replace the sequences with output sequences
    expTemplate.updateTree(mcProj.mcTree, expTemplate.buildSequenceMap())
    outputSequenceFiles = CactusPreprocessor.getOutputSequenceFiles(
        mcProj.inputSequences, expTemplate.getOutputSequenceDir())
    expTemplate.setSequences(outputSequenceFiles)

    # Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)
    # mcProj.check()  (call was already disabled in the original code)
    return 0
Beispiel #3
0
 def testCactusPreprocessor(self):
     """Run the cactus_preprocessor.py script on two demo sequences and report masking statistics.

     For each input file the preprocessed output must equal the input modulo soft
     masking. The test then prints how many bases are masked before and after, and
     compares the result against running cactus_lastzRepeatMask.py directly.

     The recall/precision figures divide by the size of the masked sets, so a
     ZeroDivisionError is raised if either set is empty.
     """
     def report(*items):
         #Emits the same text as the Python 2 statement "print a, b, ..." (items
         #joined by single spaces) while also being valid Python 3
         print(" ".join(str(item) for item in items))

     #Demo sequences
     sequenceNames = [ "%s.ENm001.fa" % species for species in ['human', 'hedgehog'] ]
     sequenceFiles = [ os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames ]
     #Make config file
     configFile = os.path.join(self.tempDir, "config.xml")
     rootElem =  ET.Element("preprocessor")
     #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/>
     preprocessor = ET.SubElement(rootElem, "preprocessor")
     preprocessor.attrib["chunkSize"] = "100000"
     preprocessor.attrib["proportionToSample"] = "0.2"
     preprocessor.attrib["preprocessorString"] = "cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --fragment=200 --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE"
     #Let ElementTree open, write and close the file itself, so no handle can leak
     #and no bytes/text mismatch arises when writing the serialised XML
     ET.ElementTree(rootElem).write(configFile)
     #Run preprocessor
     command = "cactus_preprocessor.py %s %s %s --jobTree %s" % (self.tempDir, configFile, " ".join(sequenceFiles), os.path.join(self.tempDir, "jobTree"))
     system(command)
     for sequenceFile, processedSequenceFile in zip(sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)):
         #Parse sequences into dictionary
         originalSequences = getSequences(sequenceFile)
         #Load the new sequences
         processedSequences = getSequences(processedSequenceFile)
         #Check they are the same modulo masking
         self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences)

         #Compare the proportion of bases masked by lastz with original repeat masking
         maskedBasesOriginal = getMaskedBases(originalSequences)
         maskedBasesLastzMasked = getMaskedBases(processedSequences)
         #Total bases
         totalBases = sum(len(sequence) for sequence in originalSequences.values())
         #Calculate number of hard masked bases
         totalNBases = sum(1 for (header, position, base) in maskedBasesOriginal if base.upper() == "N")

         report(" For the sequence file ", sequenceFile,
                " the total number of sequences is ", len(originalSequences),
                " the total number of bases ", totalBases,
                " the number of bases originally masked was: ", len(maskedBasesOriginal),
                " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
                " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
                " the total number of bases that are Ns ", totalNBases)

         #Now compare to running lastz on its own
         command = "cactus_lastzRepeatMask.py --proportionSampled=0.2 --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped --queryhsplimit=keep,nowarn:30' --fragment=200 %s %s" % \
                    (sequenceFile, self.tempOutputFile)
         popenPush(command, sequenceFile)
         lastzSequencesFast = getSequences(self.tempOutputFile)
         maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)

         #Float so that the two ratios below are true divisions under Python 2 as well
         sharedMaskedBases = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
         report(" The number of bases masked after running lastz repeat masking without the preprocessor is: ", len(maskedBasesLastzMaskedFast),
                " the recall of the fast vs. the new is: ", sharedMaskedBases/len(maskedBasesLastzMasked),
                " the precision of the fast vs. the new is: ", sharedMaskedBases/len(maskedBasesLastzMaskedFast))
 def testCactusPreprocessor(self):
     """Run runCactusPreprocessor on two demo sequences and check the resulting masking.

     For each input file the preprocessed output must equal the input modulo soft
     masking. Masking statistics are printed, and the masked bases after
     preprocessing are asserted to be greater than the originally masked bases.
     """
     def report(*items):
         #Emits the same text as the Python 2 statement "print a, b, ..." (items
         #joined by single spaces) while also being valid Python 3
         print(" ".join(str(item) for item in items))

     #Demo sequences
     sequenceNames = [ "%s.ENm001.fa" % species for species in ['human', 'hedgehog'] ]
     sequenceFiles = [ os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames ]
     #Make config file
     configFile = os.path.join(self.tempDir, "config.xml")
     rootElem =  ET.Element("preprocessor")
     #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/>
     preprocessor = ET.SubElement(rootElem, "preprocessor")
     preprocessor.attrib["chunkSize"] = "100000"
     preprocessor.attrib["proportionToSample"] = "0.2"
     preprocessor.attrib["preprocessJob"] = "lastzRepeatMask"
     preprocessor.attrib["minPeriod"] = "1"
     preprocessor.attrib["lastzOpts"] = "--step=1 --ambiguous=iupac,100 --ungapped"
     preprocessor.attrib["fragment"] = "200"
     #Let ElementTree open, write and close the file itself, so no handle can leak
     #and no bytes/text mismatch arises when writing the serialised XML
     ET.ElementTree(rootElem).write(configFile)
     #Run preprocessor
     tmpToil = os.path.join(self.tempDir, "toil")
     runCactusPreprocessor(outputSequenceDir=self.tempDir, configFile=configFile, inputSequences=sequenceFiles, toilDir=tmpToil)

     for sequenceFile, processedSequenceFile in zip(sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)):
         #Parse sequences into dictionary
         originalSequences = getSequences(sequenceFile)
         #Load the new sequences
         processedSequences = getSequences(processedSequenceFile)

         #Check they are the same modulo masking
         self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences)

         #Compare the proportion of bases masked by lastz with original repeat masking
         maskedBasesOriginal = getMaskedBases(originalSequences)
         maskedBasesLastzMasked = getMaskedBases(processedSequences)
         #Total bases
         totalBases = sum(len(sequence) for sequence in originalSequences.values())
         #Calculate number of hard masked bases
         totalNBases = sum(1 for (header, position, base) in maskedBasesOriginal if base.upper() == "N")

         report(" For the sequence file ", sequenceFile,
                " the total number of sequences is ", len(originalSequences),
                " the total number of bases ", totalBases,
                " the number of bases originally masked was: ", len(maskedBasesOriginal),
                " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
                " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
                " the total number of bases that are Ns ", totalNBases)
         #NOTE(review): both operands are used with .intersection above, so they are
         #presumably sets; for sets ">" means "proper superset", not "has more
         #elements" - confirm that is the intended check
         self.assertGreater(maskedBasesLastzMasked, maskedBasesOriginal)
Beispiel #5
0
 def run(self):
     """Load the multi-cactus project, queue its preprocessing and schedule the progressive alignment.

     The project XML path is taken from self.args[0]. A CactusPreprocessor child
     target is added for the project's input sequences, and a ProgressiveDown
     follow-on target is set for self.options.event, which defaults to the root
     of the project tree when no event was given.

     Raises AssertionError if the chosen event is not in the project's expMap.
     """
     #Load the multi-cactus project
     project = MultiCactusProject()
     project.readXML(self.args[0])
     #Create jobs to create the output sequences
     configNode = ET.parse(project.getConfigPath()).getroot()
     ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() #This is necessary..
     #Create the preprocessor
     self.addChildTarget(CactusPreprocessor(project.getInputSequencePaths(),
                                            CactusPreprocessor.getOutputSequenceFiles(project.getInputSequencePaths(), project.getOutputSequenceDir()),
                                            configNode))
     #Now build the progressive-down target
     schedule = Schedule()
     schedule.loadProject(project)
     schedule.compute()
     #Identity test: "== None" would go through a custom __eq__ if the value had one
     if self.options.event is None:
         self.options.event = project.mcTree.getRootName()
     assert self.options.event in project.expMap, \
         "Event %s not found in the project's experiment map" % self.options.event
     leafNames = [ project.mcTree.getName(i) for i in project.mcTree.getLeaves() ]
     self.options.globalLeafEventSet = set(leafNames)
     self.setFollowOnTarget(ProgressiveDown(self.options, project, self.options.event, schedule))
 def testCactusPreprocessor(self):
     """Run runCactusPreprocessor on two demo sequences and compare with a direct LastzRepeatMaskJob.

     For each input file the preprocessed output must equal the input modulo soft
     masking. Masking statistics are printed, then the same input is masked by a
     standalone LastzRepeatMaskJob run through Toil and the overlap of the two
     masked sets is reported as recall and precision.

     The recall/precision figures divide by the size of the masked sets, so a
     ZeroDivisionError is raised if either set is empty.
     """
     def report(*items):
         #Emits the same text as the Python 2 statement "print a, b, ..." (items
         #joined by single spaces) while also being valid Python 3
         print(" ".join(str(item) for item in items))

     #Demo sequences
     sequenceNames = [ "%s.ENm001.fa" % species for species in ['human', 'hedgehog'] ]
     sequenceFiles = [ os.path.join(self.encodePath, self.encodeRegion, sequenceName) for sequenceName in sequenceNames ]
     #Make config file
     configFile = os.path.join(self.tempDir, "config.xml")
     rootElem =  ET.Element("preprocessor")
     #<preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory" preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/>
     preprocessor = ET.SubElement(rootElem, "preprocessor")
     preprocessor.attrib["chunkSize"] = "100000"
     preprocessor.attrib["proportionToSample"] = "0.2"
     preprocessor.attrib["preprocessJob"] = "lastzRepeatMask"
     preprocessor.attrib["minPeriod"] = "1"
     preprocessor.attrib["lastzOpts"] = "--step=1 --ambiguous=iupac,100 --ungapped"
     preprocessor.attrib["fragment"] = "200"
     #Let ElementTree open, write and close the file itself, so no handle can leak
     #and no bytes/text mismatch arises when writing the serialised XML
     ET.ElementTree(rootElem).write(configFile)
     #Run preprocessor
     tmpToil = os.path.join(self.tempDir, "toil")
     runCactusPreprocessor(outputSequenceDir=self.tempDir, configFile=configFile, inputSequences=sequenceFiles, toilDir=tmpToil)

     for sequenceFile, processedSequenceFile in zip(sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)):
         print("sequenceFile: %s" % sequenceFile)
         print("output sequence file: %s" % processedSequenceFile)
         #Parse sequences into dictionary
         originalSequences = getSequences(sequenceFile)
         #Load the new sequences
         processedSequences = getSequences(processedSequenceFile)

         #Check they are the same modulo masking
         self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences)

         #Compare the proportion of bases masked by lastz with original repeat masking
         maskedBasesOriginal = getMaskedBases(originalSequences)
         maskedBasesLastzMasked = getMaskedBases(processedSequences)
         #Total bases
         totalBases = sum(len(sequence) for sequence in originalSequences.values())
         #Calculate number of hard masked bases
         totalNBases = sum(1 for (header, position, base) in maskedBasesOriginal if base.upper() == "N")

         report(" For the sequence file ", sequenceFile,
                " the total number of sequences is ", len(originalSequences),
                " the total number of bases ", totalBases,
                " the number of bases originally masked was: ", len(maskedBasesOriginal),
                " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
                " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
                " the total number of bases that are Ns ", totalNBases)

         #Now compare to running lastz on its own
         toilOptions = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "lastzRepeatMaskToil"))
         toilOptions.logLevel = "CRITICAL"
         with Toil(toilOptions) as toil:
             queryID = toil.importFile(makeURL(sequenceFile))
             #The sequence is masked against itself: the query is also the only target
             targetIDs = [queryID]
             repeatMaskedID = toil.start(LastzRepeatMaskJob(queryID=queryID, targetIDs=targetIDs, repeatMaskOptions=RepeatMaskOptions(lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped --queryhsplimit=keep,nowarn:30', minPeriod=1, proportionSampled=0.2, fragment=200)))
             toil.exportFile(repeatMaskedID, makeURL(self.tempOutputFile))

         lastzSequencesFast = getSequences(self.tempOutputFile)
         maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)

         #Float so that the two ratios below are true divisions under Python 2 as well
         sharedMaskedBases = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
         report(" The number of bases masked after running lastz repeat masking without the preprocessor is: ", len(maskedBasesLastzMaskedFast),
                " the recall of the fast vs. the new is: ", sharedMaskedBases/len(maskedBasesLastzMasked),
                " the precision of the fast vs. the new is: ", sharedMaskedBases/len(maskedBasesLastzMaskedFast))