def getCactusWorkflowExperimentForTest(sequences, newickTreeString, outputDir,
                                       configFile=None, constraints=None,
                                       progressive=False, reconstruct=True):
    """Create an ExperimentWrapper for a test, honouring the global database conf.

    The leaf genomes of ``newickTreeString`` (visited in post-order) are paired
    positionally with ``sequences``; each pair is printed and registered on the
    experiment via ``setSequenceID``. The root genome is always named
    "reference".

    Arguments:
    sequences -- sequence identifiers, one per leaf genome, in post-order.
    newickTreeString -- newick tree of the genomes (parsed without implied roots).
    outputDir -- directory that will hold "test.hal" and "test.fa".
    configFile, constraints, progressive -- forwarded unchanged to
        ExperimentWrapper.createExperimentWrapper.
    reconstruct -- when true, the root is flagged as reconstructed.

    Returns the configured experiment wrapper.
    """
    halPath = os.path.join(outputDir, "test.hal")
    fastaPath = os.path.join(outputDir, "test.fa")

    # Only build a database conf element when one has been set globally.
    if _GLOBAL_DATABASE_CONF_STRING is not None:
        dbConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)
    else:
        dbConf = None

    # Collect the leaf names in post-order; this order defines how the
    # entries of `sequences` are matched to genomes below.
    speciesTree = NXNewick().parseString(newickTreeString, addImpliedRoots=False)
    leafGenomes = []
    for nodeId in speciesTree.postOrderTraversal():
        if speciesTree.isLeaf(nodeId):
            leafGenomes.append(speciesTree.getName(nodeId))

    experiment = ExperimentWrapper.createExperimentWrapper(
        newickTreeString,
        leafGenomes,
        outputDir,
        databaseConf=dbConf,
        configFile=configFile,
        halFile=halPath,
        fastaFile=fastaPath,
        constraints=constraints,
        progressive=progressive)

    for pair in zip(leafGenomes, sequences):
        print(pair)
        leafName, seqId = pair
        experiment.setSequenceID(leafName, seqId)

    experiment.setRootGenome("reference")
    if reconstruct:
        experiment.setRootReconstructed(True)
    return experiment
def run(self):
    """Queue one inserted-column BED job per ancestral genome, then the follow-on.

    The tree is obtained by running ``halStats --tree`` on ``self.halFile``.
    Every non-leaf node gets a temp file (under the global temp dir) and a
    GetInsertedColumnBed child target that is handed that file. Once all
    children are added, RunAncestorsMLParallel is set as the follow-on and
    receives the genome => BED file mapping.
    """
    treeText = popenCatch("halStats --tree %s" % self.halFile)
    halTree = NXNewick().parseString(treeText)

    # Ancestors are exactly the nodes that have at least one child; the
    # generator is lazy, so nodes are still handled one at a time in post-order.
    ancestorIds = (node for node in halTree.postOrderTraversal()
                   if len(halTree.getChildren(node)) != 0)

    insertionBeds = {}  # genome => bed file of inserted columns
    for node in ancestorIds:
        assert halTree.hasName(node)
        ancestor = halTree.getName(node)
        bedPath = getTempFile(rootDir=self.getGlobalTempDir())
        insertionBeds[ancestor] = bedPath
        child = GetInsertedColumnBed(self.halFile, ancestor, bedPath)
        self.addChildTarget(child)

    followOn = RunAncestorsMLParallel(self.halFile, self.phyloPModel,
                                      insertionBeds, self.jobsPerGenome,
                                      self.threshold)
    self.setFollowOnTarget(followOn)
if __name__ == '__main__':
    # Script entry point: convert a newick tree plus an aligned fasta file
    # into a single-block MAF written to stdout.
    #
    # Usage: <script> <newick file> <fasta file>
    # Prints the module docstring and exits with status 1 when fewer than two
    # arguments are given.
    if len(sys.argv) < 3:
        print(__doc__)
        sys.exit(1)
    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]

    # Only the first line of the newick file is used as the tree. The file is
    # opened in a `with` block so the handle is closed deterministically.
    with open(newickPath) as newickFile:
        treeString = newickFile.read().split("\n")[0].strip()
    tree = NXNewick().parseString(treeString)

    # fastaRead yields (name, sequence) pairs; it must be fully consumed while
    # the file is still open. Later duplicates of a name overwrite earlier ones.
    with open(fastaPath) as fastaFile:
        sequences = dict(fastaRead(fastaFile))

    # Print MAF, with sequence lines in post-order.
    # Single-argument print(...) produces identical output under Python 2's
    # print statement and Python 3's print function.
    print('##maf version=1 scoring=NA')
    print('a tree="%s"' % (treeString))
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
            raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName))
        seq = sequences[nodeName]
        # Start is 0, and the ungapped length is printed twice, as in the
        # original output: once for the size field, once for the source size.
        seqLen = lengthWithoutGaps(seq)
        print('s %s 0 %d + %d %s' % (nodeName, seqLen, seqLen, seq))
    # mafValidator wants an empty closing line(?)
    print('')