Ejemplo n.º 1
0
def main():
    """Command-line entry point.

    Parses the arguments, reads the species tree from the hal file via
    ``halStats --tree``, and for every target genome calls ``liftMutations``
    once per genome on the path from the reference to the target: first for
    the genomes before the MRCA (with ``reversePolarity=True``), then for the
    genomes after it. Each target gets its own ``<target>.bed`` path inside
    the output directory, which is handed to ``liftMutations``.

    Raises:
        ValueError: if the reference genome or any requested target genome
            is not a leaf of the species tree.
        OSError: if the output directory cannot be created.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('hal', help='hal file')
    parser.add_argument('refGenome', help='reference genome')
    parser.add_argument('halTreeMutationsDir',
                        help='the directory output by halTreeMutations.py')
    parser.add_argument(
        '--targets',
        help='target genomes (comma-separated), default: all leaves')
    parser.add_argument('outputDir',
                        help='output directory for reference beds')
    opts = parser.parse_args()

    # Get the species tree from the hal file.
    newickTree = popenCatch('halStats --tree %s' % (opts.hal))
    tree = NXNewick().parseString(newickTree)

    # Set the target genomes to be all leaves (minus the reference) if not
    # otherwise directed.
    leafGenomes = [tree.getName(x) for x in tree.getLeaves()]
    if opts.refGenome not in leafGenomes:
        raise ValueError("Reference genome %s is not a leaf genome." %
                         opts.refGenome)
    if opts.targets is None:
        opts.targets = [x for x in leafGenomes if x != opts.refGenome]
    else:
        opts.targets = opts.targets.split(',')
        if not all(x in leafGenomes for x in opts.targets):
            raise ValueError("Some target genomes are not leaves.")

    # Create the output directory; an already-existing directory is fine.
    # Only OSError is caught so that KeyboardInterrupt/SystemExit are never
    # swallowed by this handler.
    try:
        os.makedirs(opts.outputDir)
    except OSError:
        if not os.path.isdir(opts.outputDir):
            raise

    # The reference is the same for every target, so look it up once.
    refID = getTreeID(tree, opts.refGenome)
    for target in opts.targets:
        targetID = getTreeID(tree, target)
        mrca = getMRCA(tree, refID, targetID)
        mrcaName = tree.getName(mrca)
        pathToTarget = getPath(opts.hal, opts.refGenome, target)
        # Split the path around the MRCA: the run of genomes before it and
        # the run after it. The MRCA's own group (key True) is dropped, so
        # exactly two groups are expected here.
        pathUp, pathDown = [
            list(genomes)
            for isMrca, genomes in groupby(pathToTarget,
                                           lambda x: x == mrcaName)
            if not isMrca
        ]
        bedForTarget = os.path.join(opts.outputDir, target + '.bed')
        # First, walk up the tree to the MRCA.
        for curGenome in pathUp:
            liftMutations(opts.halTreeMutationsDir,
                          opts.hal,
                          curGenome,
                          opts.refGenome,
                          bedForTarget,
                          reversePolarity=True)
        # Next, walk down the tree to the target.
        for curGenome in pathDown:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget)
Ejemplo n.º 2
0
def getCactusWorkflowExperimentForTest(sequences,
                                       newickTreeString,
                                       outputDir,
                                       configFile=None,
                                       constraints=None,
                                       progressive=False,
                                       reconstruct=True):
    """Build an experiment wrapper for tests, honouring the global database conf.

    The leaf names of the tree (in post-order) are paired positionally with
    ``sequences`` and registered on the experiment. The hal and fasta outputs
    are placed in ``outputDir`` as ``test.hal`` and ``test.fa``. The root
    genome is always set to "reference"; when ``reconstruct`` is true the
    root is additionally flagged as reconstructed.

    Returns the object produced by ``ExperimentWrapper.createExperimentWrapper``.
    """
    halFile = os.path.join(outputDir, "test.hal")
    fastaFile = os.path.join(outputDir, "test.fa")

    # Only parse a database configuration if one has been set globally.
    if _GLOBAL_DATABASE_CONF_STRING is None:
        databaseConf = None
    else:
        databaseConf = ET.fromstring(_GLOBAL_DATABASE_CONF_STRING)

    tree = NXNewick().parseString(newickTreeString, addImpliedRoots=False)

    # Collect leaf names in post-order.
    genomes = []
    for nodeId in tree.postOrderTraversal():
        if tree.isLeaf(nodeId):
            genomes.append(tree.getName(nodeId))

    exp = ExperimentWrapper.createExperimentWrapper(
        newickTreeString,
        genomes,
        outputDir,
        databaseConf=databaseConf,
        configFile=configFile,
        halFile=halFile,
        fastaFile=fastaFile,
        constraints=constraints,
        progressive=progressive)

    # zip stops at the shorter of the two inputs, so extra genomes or extra
    # sequences are ignored.
    for genomeAndSequence in zip(genomes, sequences):
        print(genomeAndSequence)
        exp.setSequenceID(*genomeAndSequence)

    exp.setRootGenome("reference")
    if reconstruct:
        exp.setRootReconstructed(True)
    return exp
Ejemplo n.º 3
0
 def run(self):
     """Add one GetInsertedColumnBed child per non-leaf genome, then a follow-on.

     The tree is read from ``self.halFile`` through ``halStats --tree``. Every
     node that has children gets a fresh temp file under the global temp dir;
     the resulting genome => bed file mapping is handed to the
     RunAncestorsMLParallel follow-on target.
     """
     treeString = popenCatch("halStats --tree %s" % self.halFile)
     speciesTree = NXNewick().parseString(treeString)
     # genome => bed file of inserted columns
     bedByGenome = {}
     for node in speciesTree.postOrderTraversal():
         # Nodes without children (leaves) are skipped.
         if len(speciesTree.getChildren(node)) > 0:
             assert speciesTree.hasName(node)
             ancestor = speciesTree.getName(node)
             bedPath = getTempFile(rootDir=self.getGlobalTempDir())
             bedByGenome[ancestor] = bedPath
             self.addChildTarget(
                 GetInsertedColumnBed(self.halFile, ancestor, bedPath))
     followOn = RunAncestorsMLParallel(self.halFile, self.phyloPModel,
                                       bedByGenome, self.jobsPerGenome,
                                       self.threshold)
     self.setFollowOnTarget(followOn)
if __name__ == '__main__':
    # Script entry point: reads a newick tree file and a fasta file (the
    # first two command-line arguments) and prints a single MAF block to
    # stdout containing one sequence line per leaf of the tree.
    #
    # print is always called with one parenthesised argument so the output
    # is the same under Python 2's print statement and Python 3's function.
    if len(sys.argv) < 3:
        print(__doc__)
        sys.exit(1)

    newickPath = sys.argv[1]
    fastaPath = sys.argv[2]
    # Only the first line of the newick file is used as the tree string.
    # The file is closed deterministically instead of being leaked.
    with open(newickPath) as newickFile:
        treeString = newickFile.readline().strip()
    tree = NXNewick().parseString(treeString)

    # name => sequence; a later record with the same name replaces an
    # earlier one.
    with open(fastaPath) as fastaFile:
        sequences = dict(fastaRead(fastaFile))

    # Print MAF, with sequence lines in post-order.
    print('##maf version=1 scoring=NA')
    print('a tree="%s"' % (treeString))
    for nodeId in tree.postOrderTraversal():
        if not tree.isLeaf(nodeId):
            continue
        nodeName = tree.getName(nodeId)
        if nodeName not in sequences:
            raise RuntimeError("The tree has a node %s which was not found in the fasta file" % (nodeName))
        seq = sequences[nodeName]
        seqLen = lengthWithoutGaps(seq)
        # Start is 0 and the source size equals the ungapped length, i.e.
        # each line covers the whole sequence on the + strand.
        print('s %s 0 %d + %d %s' % (nodeName, seqLen, seqLen, seq))
    # mafValidator wants an empty closing line(?)
    print('')