Example #1
0
def getCactusWorkflowExperimentForTest(sequences,
                                       newickTreeString,
                                       outputDir,
                                       configFile=None,
                                       constraints=None,
                                       progressive=False,
                                       reconstruct=True):
    """Create an ExperimentWrapper for a test, honouring any global database conf.

    The HAL and FASTA output paths are fixed to ``test.hal`` and ``test.fa``
    inside ``outputDir``. Leaf names of ``newickTreeString`` (collected in
    post-order) are used as the genome list, and ``sequences`` are assigned to
    those genomes positionally; because the pairing uses ``zip``, surplus
    entries on either side are silently left unpaired. Each (genome, sequence)
    pair is printed as it is assigned.

    The root genome is always set to "reference"; when ``reconstruct`` is true
    the root is additionally flagged as reconstructed.

    ``configFile``, ``constraints`` and ``progressive`` are forwarded unchanged
    to ``ExperimentWrapper.createExperimentWrapper``.

    Returns the configured experiment wrapper.
    """
    halFile = os.path.join(outputDir, "test.hal")
    fastaFile = os.path.join(outputDir, "test.fa")

    # Only parse the module-level database configuration if one has been set.
    if _GLOBAL_DATABASE_CONF_STRING is not None:
        databaseConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)
    else:
        databaseConf = None

    tree = NXNewick().parseString(newickTreeString, addImpliedRoots=False)

    # Genome names are the tree's leaves, in post-order.
    genomes = []
    for nodeId in tree.postOrderTraversal():
        if tree.isLeaf(nodeId):
            genomes.append(tree.getName(nodeId))

    wrapperOptions = dict(
        databaseConf=databaseConf,
        configFile=configFile,
        halFile=halFile,
        fastaFile=fastaFile,
        constraints=constraints,
        progressive=progressive,
    )
    exp = ExperimentWrapper.createExperimentWrapper(
        newickTreeString, genomes, outputDir, **wrapperOptions)

    # Positional pairing: the i-th sequence belongs to the i-th post-order leaf.
    for pairing in zip(genomes, sequences):
        print(pairing)
        exp.setSequenceID(*pairing)

    exp.setRootGenome("reference")
    if reconstruct:
        exp.setRootReconstructed(True)
    return exp
Example #2
0
 def run(self):
     """Queue a GetInsertedColumnBed child for every non-leaf genome in the
     HAL tree, then a RunAncestorsMLParallel follow-on over the resulting
     genome => bed-file mapping.
     """
     # halStats reports the alignment's tree as a newick string.
     treeCommand = "halStats --tree %s" % self.halFile
     halTree = NXNewick().parseString(popenCatch(treeCommand))

     insertedColumnBeds = {} # genome => bed files of inserted columns
     for node in halTree.postOrderTraversal():
         childCount = len(halTree.getChildren(node))
         if childCount == 0:
             # Leaves are not ancestral genomes; nothing to schedule for them.
             continue
         assert halTree.hasName(node)
         genomeName = halTree.getName(node)
         # Each genome gets its own temp file under the global temp dir.
         tempRoot = self.getGlobalTempDir()
         bedPath = getTempFile(rootDir=tempRoot)
         insertedColumnBeds[genomeName] = bedPath
         childTarget = GetInsertedColumnBed(self.halFile, genomeName, bedPath)
         self.addChildTarget(childTarget)

     followOn = RunAncestorsMLParallel(self.halFile,
                                       self.phyloPModel,
                                       insertedColumnBeds,
                                       self.jobsPerGenome,
                                       self.threshold)
     self.setFollowOnTarget(followOn)
if __name__ == '__main__':
    # Script entry point: argv[1] is a newick tree file, argv[2] a fasta file.
    # Writes a single-block MAF to stdout with one 's' line per tree leaf.
    #
    # NOTE: every print below takes exactly one parenthesized argument, which
    # yields the same output under the Python 2 print statement and the
    # Python 3 print function.

    # Parse args
    if len(sys.argv) < 3:
        print(__doc__)
        sys.exit(1)

    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]

    # Only the first line of the tree file is used. The context manager makes
    # sure the handle is closed instead of leaking it.
    with open(newickPath) as newickFile:
        treeString = newickFile.read().split("\n")[0].strip()
    tree = NXNewick().parseString(treeString)

    # The fasta reader is consumed completely while the handle is still open,
    # then the file is closed. Later records with a repeated name overwrite
    # earlier ones, as before.
    sequences = {}
    with open(fastaPath) as fastaFile:
        for name, seq in fastaRead(fastaFile):
            sequences[name] = seq

    # Print MAF, with sequence lines in post-order.
    print('##maf version=1 scoring=NA')
    print('a tree="%s"' % (treeString))
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
            raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName))
        seq = sequences[nodeName]
        # Start is always 0 and size equals source length: each row is
        # written as covering its whole (ungapped) sequence.
        seqLen = lengthWithoutGaps(seq)
        print('s %s 0 %d + %d %s' % (nodeName, seqLen, seqLen, seq))
    # mafValidator wants an empty closing line(?)
    print('')