def runCreateMultiCactusProject(expFile, projectFile, fixNames=False,
                                outgroupNames=None, root=None,
                                overwrite=False):
    """Build a multi-cactus project from an experiment XML template.

    The arguments are packed into a CreateMultiCactusProjectOptions object,
    the experiment XML and the config file it points to are parsed, and the
    result is handed to createMCProject / createFileStructure.

    Args:
        expFile: path of the experiment XML used as the template.
        projectFile: forwarded to CreateMultiCactusProjectOptions.
        fixNames: when true, cleanEventTree() is applied to the experiment
            before its tree is read.
        outgroupNames: optional iterable of outgroup names; each one must be
            the name of a leaf of the experiment tree.
        root: forwarded to CreateMultiCactusProjectOptions.
        overwrite: forwarded to CreateMultiCactusProjectOptions.

    Raises:
        RuntimeError: if a requested outgroup name is not a leaf name.
    """
    options = CreateMultiCactusProjectOptions(
        expFile, projectFile,
        fixNames=fixNames,
        outgroupNames=outgroupNames,
        root=root,
        overwrite=overwrite)

    # Load the experiment template and the config it references.
    experimentXml = ET.parse(options.expFile).getroot()
    expTemplate = ExperimentWrapper(experimentXml)
    configXml = ET.parse(expTemplate.getConfigPath()).getroot()
    confTemplate = ConfigWrapper(configXml)

    if options.fixNames:
        cleanEventTree(expTemplate)
    tree = expTemplate.getTree()

    # Every requested outgroup has to be a leaf of the species tree.
    if options.outgroupNames is not None:
        options.outgroupNames = set(options.outgroupNames)
        leafNames = {tree.getName(leaf) for leaf in tree.getLeaves()}
        for requested in options.outgroupNames:
            if requested in leafNames:
                continue
            raise RuntimeError("Specified outgroup %s not found in tree" % requested)

    mcProj = createMCProject(tree, expTemplate, confTemplate, options)

    # Replace the sequences with output sequences
    sequenceMap = expTemplate.buildSequenceMap()
    expTemplate.updateTree(mcProj.mcTree, sequenceMap)

    # Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)
def progressiveFunction(self, experimentFile, toilDir, batchSystem, buildAvgs,
                        buildReference, buildHal, buildFasta, toilStats,
                        subtreeRoot=None):
    """Run a progressive alignment and check output headers against the input.

    A multi-cactus project is created from experimentFile inside a fresh
    temporary directory and run with runCactusProgressive. Afterwards every
    sequence line of each sub-experiment's c2h output is checked: for genomes
    that appear in the input sequence map and are not outgroups of that
    sub-experiment, the header must occur in the corresponding input FASTA.

    Args:
        experimentFile: path of the experiment XML to run.
        toilDir: passed to runCactusProgressive and
            runToilStatusAndFailIfNotComplete.
        batchSystem, buildAvgs, toilStats: forwarded to runCactusProgressive.
        buildReference, buildHal, buildFasta: accepted but not used here.
        subtreeRoot: forwarded as ``root`` to runCreateMultiCactusProject.

    Note:
        The temporary project directory is removed by the last statement, so
        it is left in place if anything above raises.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile, tempExperimentDir,
                                fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)
    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> set of headers in fasta. Only membership is ever
    # tested below, so a set avoids a linear scan per c2h sequence line.
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        concatenatedPath = None
        try:
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = set(
                map(itemgetter(0), fastaRead(inputSequencePath)))
        finally:
            # The concatenated copy is only needed while reading the headers;
            # remove it so it does not leak.
            if concatenatedPath is not None and os.path.exists(concatenatedPath):
                os.remove(concatenatedPath)

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] != 's':
                    continue
                # Sequence line. The first and last character of each field
                # are dropped (presumably surrounding delimiters -- confirm
                # against the c2h format).
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome in headers and genome not in outgroups:
                    # This genome is an input genome
                    self.assertTrue(
                        header in headers[genome],
                        'Header %s from output c2h %s not found in input fa %s'
                        ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
def testSequenceMap(self):
    """Check the tree round-trips and each sequence path is mapped by name.

    The expected key for a sequence path is the path without its extension,
    upper-cased.
    """
    experiment = ExperimentWrapper(self.__makeXmlDummy(self.tree, self.sequences))
    assert NXNewick().writeString(experiment.getTree()) == self.tree

    sequenceMap = experiment.buildSequenceMap()
    for seqPath in self.sequences.split():
        baseName, _extension = os.path.splitext(seqPath)
        assert sequenceMap[baseName.upper()] == seqPath
def testSequenceMap(self):
    """Verify buildSequenceMap() against the dummy experiment's sequences.

    Each whitespace-separated entry of self.sequences must be stored under
    its extension-less, upper-cased form.
    """
    dummyRoot = self.__makeXmlDummy(self.tree, self.sequences)
    wrapper = ExperimentWrapper(dummyRoot)
    newick = NXNewick().writeString(wrapper.getTree())
    assert newick == self.tree

    mapping = wrapper.buildSequenceMap()
    paths = self.sequences.split()
    keys = [os.path.splitext(path)[0].upper() for path in paths]
    for key, path in zip(keys, paths):
        assert mapping[key] == path
def main():
    """Command-line entry point that sets up a multi-cactus project.

    Expects exactly two positional arguments: the experiment XML used as a
    template and the output project path. Validates the output path, the
    species tree and any requested outgroups, then builds the project and
    writes its file structure.

    Returns:
        0 on success.

    Raises:
        RuntimeError: on a wrong argument count, an already existing output
            path, a single-node species tree, or an outgroup name that is
            not a leaf of the tree.
    """
    usage = "usage: %prog [options] <experiment> <output project path>"
    description = "Setup a multi-cactus project using an experiment xml as template"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "--fixNames", dest="fixNames", default="True",
        help="try to make sequence and event names MAF-compliant [default=true]")
    parser.add_option(
        "--outgroupNames", dest="outgroupNames", default=None,
        help="comma-separated names of high quality assemblies to use as outgroups [default=everything]")
    parser.add_option(
        "--root", dest="root", type=str, default=None,
        help="name of alignment root (must be labeled ancestral node in tree in input experiment). Useful "
             "for allowing the tree to contain nodes that won't be in the alignment but can still be used for "
             "outgroups.")
    parser.add_option(
        "--overwrite", action="store_true", default=False,
        help="Overwrite existing experiment files")

    options, args = parser.parse_args()
    if len(args) != 2:
        parser.print_help()
        raise RuntimeError("Wrong number of arguments")

    experimentArg, outputArg = args
    options.expFile = experimentArg
    options.path = os.path.abspath(outputArg)
    options.name = os.path.basename(options.path)
    # --fixNames arrives as a string; anything other than "false"
    # (case-insensitive) turns it on.
    options.fixNames = options.fixNames.lower() != "false"

    # An existing directory is only acceptable with --overwrite; an existing
    # regular file is never acceptable.
    if (os.path.isdir(options.path) and not options.overwrite) or os.path.isfile(options.path):
        raise RuntimeError("Output project path %s exists\n" % options.path)

    experimentXml = ET.parse(options.expFile).getroot()
    expTemplate = ExperimentWrapper(experimentXml)
    configXml = ET.parse(expTemplate.getConfigPath()).getroot()
    confTemplate = ConfigWrapper(configXml)
    if options.fixNames:
        cleanEventTree(expTemplate)
    checkInputSequencePaths(expTemplate)
    tree = expTemplate.getTree()

    # Check that the tree is sensible (root has at least 1 child)
    rootChildren = tree.getChildren(tree.getRootId())
    if len(rootChildren) == 0:
        raise RuntimeError("Input species tree has only one node.")

    if options.outgroupNames is not None:
        leafNames = {tree.getName(leaf) for leaf in tree.getLeaves()}
        options.outgroupNames = set(options.outgroupNames.split(","))
        for requested in options.outgroupNames:
            if requested in leafNames:
                continue
            raise RuntimeError("Specified outgroup %s not found in tree" % requested)

    mcProj = createMCProject(tree, expTemplate, confTemplate, options)

    # Replace the sequences with output sequences
    expTemplate.updateTree(mcProj.mcTree, expTemplate.buildSequenceMap())
    outputSequences = CactusPreprocessor.getOutputSequenceFiles(
        mcProj.inputSequences, expTemplate.getOutputSequenceDir())
    expTemplate.setSequences(outputSequences)

    # Now do the file tree creation
    createFileStructure(mcProj, expTemplate, confTemplate, options)
    return 0
def testOutgroups(self):
    """Check that added outgroup sequences show up as events and in the map.

    Two outgroups are added; they must be reported by getOutgroupEvents() in
    insertion order and be present in buildSequenceMap() with their paths.
    """
    experiment = ExperimentWrapper(self.__makeXmlDummy(self.tree, self.sequences))
    assert NXNewick().writeString(experiment.getTree()) == self.tree

    # (event name, numeric second argument, sequence path) for each outgroup.
    outgroupArgs = [
        ("outgroup", 1.3, "outgroup.fa"),
        ("outgroup2", 2.6, "outgroup2.fa"),
    ]
    for args in outgroupArgs:
        experiment.addOutgroupSequence(*args)
    assert experiment.getOutgroupEvents() == [args[0] for args in outgroupArgs]

    sequenceMap = experiment.buildSequenceMap()
    for args in outgroupArgs:
        eventName, seqPath = args[0], args[2]
        assert eventName in sequenceMap
        assert sequenceMap[eventName] == seqPath
def progressiveFunction(self, experimentFile, toilDir, batchSystem, buildAvgs,
                        buildReference, buildHal, buildFasta, toilStats,
                        subtreeRoot=None):
    """Run a progressive alignment and check output headers against the input.

    A multi-cactus project is created from experimentFile inside a fresh
    temporary directory and run with runCactusProgressive. Afterwards every
    sequence line of each sub-experiment's c2h output is checked: for genomes
    that appear in the input sequence map and are not outgroups of that
    sub-experiment, the header must occur in the corresponding input FASTA.

    Args:
        experimentFile: path of the experiment XML to run.
        toilDir: passed to runCactusProgressive and
            runToilStatusAndFailIfNotComplete.
        batchSystem, buildAvgs, toilStats: forwarded to runCactusProgressive.
        buildReference, buildHal, buildFasta: accepted but not used here.
        subtreeRoot: forwarded as ``root`` to runCreateMultiCactusProject.

    Note:
        The temporary project directory is removed by the last statement, so
        it is left in place if anything above raises.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile, tempExperimentDir,
                                fixNames=False, root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)
    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> set of headers in fasta. Only membership is ever
    # tested below, so a set avoids a linear scan per c2h sequence line.
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        concatenatedPath = None
        try:
            if os.path.isdir(inputSequencePath):
                # Some "input sequence paths" are actually provided as
                # directories containing multiple FASTAs
                concatenatedPath = getTempFile()
                system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
                inputSequencePath = concatenatedPath
            headers[genomeName] = set(
                map(itemgetter(0), fastaRead(inputSequencePath)))
        finally:
            # The concatenated copy is only needed while reading the headers;
            # remove it so it does not leak.
            if concatenatedPath is not None and os.path.exists(concatenatedPath):
                os.remove(concatenatedPath)

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] != 's':
                    continue
                # Sequence line. The first and last character of each field
                # are dropped (presumably surrounding delimiters -- confirm
                # against the c2h format).
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome in headers and genome not in outgroups:
                    # This genome is an input genome
                    self.assertTrue(
                        header in headers[genome],
                        'Header %s from output c2h %s not found in input fa %s'
                        ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)