def main():
    """Lift branch mutations onto the reference genome, one bed per target.

    Parses the command line, reads the species tree from the hal file via
    ``halStats --tree``, and for every target genome walks the tree path
    from the reference genome to that target, calling ``liftMutations`` for
    each genome on the path. Results for a target go to
    ``<outputDir>/<target>.bed``.

    Raises:
        ValueError: if the reference genome, or any genome given with
            ``--targets``, is not a leaf of the species tree.
        OSError: if the output directory cannot be created.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('hal', help='hal file')
    parser.add_argument('refGenome', help='reference genome')
    parser.add_argument('halTreeMutationsDir',
                        help='the directory output by halTreeMutations.py')
    parser.add_argument(
        '--targets',
        help='target genomes (comma-separated), default: all leaves')
    parser.add_argument('outputDir',
                        help='output directory for reference beds')
    opts = parser.parse_args()

    # Get the species tree from the hal file.
    newickTree = popenCatch('halStats --tree %s' % (opts.hal))
    tree = NXNewick().parseString(newickTree)

    # Set the target genomes to be all leaves (minus the reference) if not
    # otherwise directed.
    leafGenomes = [tree.getName(x) for x in tree.getLeaves()]
    if opts.refGenome not in leafGenomes:
        raise ValueError("Reference genome %s is not a leaf genome."
                         % opts.refGenome)
    if opts.targets is None:
        opts.targets = [x for x in leafGenomes if x != opts.refGenome]
    else:
        opts.targets = opts.targets.split(',')
        if not all(x in leafGenomes for x in opts.targets):
            raise ValueError("Some target genomes are not leaves.")

    # Create the output directory; an already-existing directory is fine.
    # Only OSError is caught, so Ctrl-C / SystemExit are never swallowed.
    try:
        os.makedirs(opts.outputDir)
    except OSError:
        if not os.path.isdir(opts.outputDir):
            raise

    # The reference's tree ID is the same for every target; look it up once.
    refID = getTreeID(tree, opts.refGenome)
    for target in opts.targets:
        targetID = getTreeID(tree, target)
        mrca = getMRCA(tree, refID, targetID)
        mrcaName = tree.getName(mrca)
        pathToTarget = getPath(opts.hal, opts.refGenome, target)
        # groupby splits the path into consecutive runs keyed on "is this
        # the MRCA?". Dropping the MRCA's own run leaves exactly two runs:
        # the genomes before the MRCA (upward leg) and the genomes after it
        # (downward leg).
        pathUp, pathDown = [
            list(v)
            for k, v in groupby(pathToTarget, lambda x: x == mrcaName)
            if not k
        ]
        bedForTarget = os.path.join(opts.outputDir, target + '.bed')
        # First, walk up the tree to the MRCA.
        for curGenome in pathUp:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget, reversePolarity=True)
        # Next, walk down the tree to the target.
        for curGenome in pathDown:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget)
def getCactusWorkflowExperimentForTest(sequences, newickTreeString, outputDir,
                                       configFile=None, constraints=None,
                                       progressive=False, reconstruct=True):
    """Build an experiment wrapper for a test run, honouring the global
    database configuration when one has been set.

    The hal and fasta outputs are placed at ``<outputDir>/test.hal`` and
    ``<outputDir>/test.fa``. Leaf genomes are taken from the tree in
    post-order and paired positionally with ``sequences``.

    Args:
        sequences: one entry per leaf genome, in post-order leaf order; each
            is handed to ``setSequenceID`` for its genome.
        newickTreeString: newick string of the species tree.
        outputDir: directory passed to the wrapper and used for the outputs.
        configFile, constraints, progressive: forwarded unchanged to
            ``ExperimentWrapper.createExperimentWrapper``.
        reconstruct: when true, the root is marked as reconstructed.

    Returns:
        The configured experiment wrapper, with root genome "reference".
    """
    halPath = os.path.join(outputDir, "test.hal")
    fastaPath = os.path.join(outputDir, "test.fa")

    # Parse the module-level database conf only when one has been set.
    if _GLOBAL_DATABASE_CONF_STRING is None:
        dbConf = None
    else:
        dbConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)

    speciesTree = NXNewick().parseString(newickTreeString,
                                         addImpliedRoots=False)
    # Collect leaf names in post-order; this order defines how the
    # sequences are matched to genomes below.
    leafNames = []
    for nodeId in speciesTree.postOrderTraversal():
        if speciesTree.isLeaf(nodeId):
            leafNames.append(speciesTree.getName(nodeId))

    exp = ExperimentWrapper.createExperimentWrapper(
        newickTreeString,
        leafNames,
        outputDir,
        databaseConf=dbConf,
        configFile=configFile,
        halFile=halPath,
        fastaFile=fastaPath,
        constraints=constraints,
        progressive=progressive)

    for pair in zip(leafNames, sequences):
        print(pair)
        exp.setSequenceID(pair[0], pair[1])

    exp.setRootGenome("reference")
    if reconstruct:
        exp.setRootReconstructed(True)
    return exp
def run(self):
    """Queue one inserted-column bed job per ancestral genome, then the
    follow-on that consumes all of the resulting bed files.

    Every internal (non-leaf) node of the hal species tree gets a temp bed
    file and a ``GetInsertedColumnBed`` child target. Once the children are
    queued, ``RunAncestorsMLParallel`` is set as the follow-on target and
    receives the genome -> bed file mapping.
    """
    # The species tree, as reported by halStats, tells us which genomes
    # are ancestors.
    treeNewick = popenCatch("halStats --tree %s" % self.halFile)
    speciesTree = NXNewick().parseString(treeNewick)

    # ancestral genome name -> bed file of its inserted columns
    insertedColumnBeds = {}
    for node in speciesTree.postOrderTraversal():
        numChildren = len(speciesTree.getChildren(node))
        if numChildren == 0:
            # Leaves are not ancestors, so there is nothing to queue.
            continue
        assert speciesTree.hasName(node)
        ancestor = speciesTree.getName(node)
        bedPath = getTempFile(rootDir=self.getGlobalTempDir())
        insertedColumnBeds[ancestor] = bedPath
        childJob = GetInsertedColumnBed(self.halFile, ancestor, bedPath)
        self.addChildTarget(childJob)

    followOn = RunAncestorsMLParallel(self.halFile, self.phyloPModel,
                                      insertedColumnBeds,
                                      self.jobsPerGenome, self.threshold)
    self.setFollowOnTarget(followOn)
if __name__ == '__main__':
    # Script entry point: convert a newick tree plus a gapped fasta into a
    # single-block MAF on stdout. Usage: <script> <newickPath> <fastaPath>.
    # The print calls take one pre-formatted string each, so they produce
    # the same output under Python 2 and Python 3.

    # Parse args
    if len(sys.argv) < 3:
        print(__doc__)
        sys.exit(1)
    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]

    # Only the first line of the newick file is used as the tree. The file
    # is closed as soon as that line has been read.
    with open(newickPath) as newickFile:
        treeString = newickFile.readline().strip()
    tree = NXNewick().parseString(treeString)

    # Load every fasta record into memory, keyed by sequence name. A later
    # record with a duplicate name overwrites the earlier one.
    sequences = {}
    with open(fastaPath) as fastaFile:
        for name, seq in fastaRead(fastaFile):
            sequences[name] = seq

    # Print MAF, with sequence lines in post-order.
    print('##maf version=1 scoring=NA')
    print('a tree="%s"' % (treeString))
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
            raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName))
        seq = sequences[nodeName]
        # Both the size and srcSize MAF fields are the ungapped length,
        # since each sequence is emitted whole, starting at position 0.
        seqLen = lengthWithoutGaps(seq)
        print('s %s 0 %d + %d %s' % (nodeName, seqLen, seqLen, seq))
    # mafValidator wants an empty closing line(?)
    print('')