def makeMaf(inHalPath, outDir, step, overwrite, doMaf):
    """Convert a HAL file to MAF with hal2maf unless the result can be reused.

    The source is inHalPath itself when step is not positive; otherwise it is
    the "lod" HAL path that makePath builds for that step.  The destination is
    the "out" MAF path makePath builds for the same step.

    Nothing is run when doMaf is false, or when the destination already
    exists and overwrite is false.
    """
    if step > 0:
        srcHalPath = makePath(inHalPath, outDir, step, "lod", "hal")
    else:
        srcHalPath = inHalPath
    outMafPath = makePath(inHalPath, outDir, step, "out", "maf")

    if not doMaf:
        return
    # Reuse an existing MAF unless the caller asked for a fresh one.
    if not overwrite and os.path.isfile(outMafPath):
        return
    runShellCommand("hal2maf %s %s" % (srcHalPath, outMafPath))
def main(argv=None):
    """Command-line entry point: parse options and run getHalTreeMutations.

    argv is a full argument vector including the program name at index 0;
    when omitted, sys.argv is used.  The output directory is created if it
    does not exist.  Unless --noSort is given, sortBed is probed with a tiny
    input; if the probe raises, a warning is printed and sorting is disabled
    for the rest of the run.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("hal", help="input hal")
    parser.add_argument("outDir", help="output dir")
    parser.add_argument("--bedName", type=str, default="%%s.bed",
                        help="Name function for output "
                        "bed files where sequence name is specifed as %%s")
    parser.add_argument("--root", default=None, type=str, help="root")
    parser.add_argument("--doSnps", action="store_true", default=False)
    parser.add_argument("--doParentDeletions", action="store_true",
                        default=False)
    parser.add_argument("--maxGap", default=10, type=int,
                        help="gap threshold")
    parser.add_argument("--noSort", action="store_true", default=False)

    # Parse the vector we were handed rather than silently falling back to
    # sys.argv; index 0 is the program name and is not an option.
    args = parser.parse_args(argv[1:])

    if not os.path.exists(args.outDir):
        os.makedirs(args.outDir)

    if not args.noSort:
        # Probe for sortBed once up front so a missing tool degrades to
        # unsorted output instead of failing later.
        try:
            runShellCommand("echo \"x\t0\t1\" | sortBed 2> /dev/null")
        except Exception:
            print("Warning: output BED files not sorted because sortBed"
                  " (BedTools) not found")
            args.noSort = True

    getHalTreeMutations(args.hal, args, args.root)
def main(argv=None):
    """Parse the command line and compute mutations for the HAL tree.

    Args:
        argv: complete argument vector, program name first.  Defaults to
            sys.argv when None.

    Side effects: creates args.outDir when missing, and turns on noSort
    (with a printed warning) when a trial sortBed invocation raises.
    Finally hands the parsed options to getHalTreeMutations.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("hal", help="input hal")
    parser.add_argument("outDir", help="output dir")
    parser.add_argument("--bedName", type=str, default="%%s.bed",
                        help="Name function for output "
                        "bed files where sequence name is specifed as %%s")
    parser.add_argument("--root", default=None, type=str, help="root")
    # The two opt-in mutation classes share the same switch shape.
    for switch in ("--doSnps", "--doParentDeletions"):
        parser.add_argument(switch, action="store_true", default=False)
    parser.add_argument("--maxGap", default=10, type=int,
                        help="gap threshold")
    parser.add_argument("--noSort", action="store_true", default=False)

    # Honour the argv parameter: previously parse_args() was called with no
    # arguments, which always reads sys.argv regardless of what was passed.
    args = parser.parse_args(argv[1:])

    if not os.path.exists(args.outDir):
        os.makedirs(args.outDir)

    if not args.noSort:
        try:
            # Any failure of this one-line pipeline is taken to mean that
            # sortBed is unavailable.
            runShellCommand("echo \"x\t0\t1\" | sortBed 2> /dev/null")
        except Exception:
            print("Warning: output BED files not sorted because sortBed" +
                  " (BedTools) not found")
            args.noSort = True

    getHalTreeMutations(args.hal, args, args.root)
def computeModel(options):
    """Run the model-building steps in order, cleaning MAFs before and after.

    Steps: extractGeneMAFs, computeAgMAFStats, computeFit, and then modFreqs
    unless options.noModFreqs is set.  Files matching options.outMafAllPaths
    are removed with "rm -f" both before the first step and after the last.
    """
    def clearGeneMafs():
        # Looked up at call time so the pattern is read exactly when used.
        runShellCommand("rm -f %s" % options.outMafAllPaths)

    clearGeneMafs()
    extractGeneMAFs(options)
    computeAgMAFStats(options)
    computeFit(options)
    if not options.noModFreqs:
        modFreqs(options)
    clearGeneMafs()
def computeModel(options):
    """Drive the full model computation for the given options.

    Removes any files matching options.outMafAllPaths, runs the extraction,
    aggregate-statistics and fitting stages in that order, optionally runs
    modFreqs (skipped when options.noModFreqs is true), and removes the
    matching files once more at the end.
    """
    runShellCommand("rm -f %s" % options.outMafAllPaths)

    for stage in (extractGeneMAFs, computeAgMAFStats, computeFit):
        stage(options)

    if options.noModFreqs:
        pass
    else:
        modFreqs(options)

    runShellCommand("rm -f %s" % options.outMafAllPaths)
def timeCmd(cmd):
    """Run cmd via runShellCommand and return the elapsed seconds as a float.

    Any exception raised by runShellCommand propagates unchanged; no time is
    returned in that case.
    """
    tstart = datetime.now()
    runShellCommand(cmd)
    tdelta = datetime.now() - tstart
    # A timedelta is normalised into days, seconds and microseconds, and
    # .seconds alone is always below 86400.  Fold the days in so a command
    # that runs past 24 hours is not reported as a short one.
    wholeSeconds = tdelta.days * 86400 + tdelta.seconds
    return wholeSeconds + tdelta.microseconds / 1000000.0
def computeFit(options):
    """Invoke phyloFit with the configured options, then delete outMafSS.

    The command receives options.tree, options.substMod, options.outMafSS,
    options.precision and, as --out-root, options.outMod with its extension
    stripped.  " --error <value> " is appended when options.error is set.
    After phyloFit returns, options.outMafSS is removed with "rm -f".
    """
    outRoot = os.path.splitext(options.outMod)[0]
    fitCmd = "phyloFit --tree \"%s\" --subst-mod %s --sym-freqs %s --precision %s --out-root %s" % (
        options.tree, options.substMod, options.outMafSS,
        options.precision, outRoot)

    if options.error is None:
        errorSuffix = ""
    else:
        errorSuffix = " --error %s " % options.error

    runShellCommand(fitCmd + errorSuffix)
    runShellCommand("rm -f %s" % options.outMafSS)
def timeCmd(cmd):
    """Time a shell command.

    Args:
        cmd: command string passed as-is to runShellCommand.

    Returns:
        Seconds between just before and just after the call, as a float.

    Exceptions from runShellCommand are not caught.
    """
    tstart = datetime.now()
    runShellCommand(cmd)
    tend = datetime.now()
    tdelta = tend - tstart
    # Include the days component: seconds + microseconds on their own wrap
    # around every 24 hours because timedelta stores whole days separately.
    tsecs = tdelta.days * 86400 + tdelta.seconds
    tsecs += tdelta.microseconds / 1000000.0
    return tsecs
def computeFit(options):
    """Run phyloFit for the configured tree and model, then remove outMafSS.

    --out-root is options.outMod without its file extension.  When
    options.error is not None, " --error <value> " is appended to the
    command.  options.outMafSS is deleted with "rm -f" after phyloFit runs.
    """
    pieces = [
        "phyloFit",
        "--tree \"%s\"" % options.tree,
        "--subst-mod %s" % options.substMod,
        "--sym-freqs %s" % options.outMafSS,
        "--out-root %s" % os.path.splitext(options.outMod)[0],
    ]
    cmd = " ".join(pieces)
    if options.error is not None:
        cmd = cmd + " --error %s " % options.error

    runShellCommand(cmd)
    runShellCommand("rm -f %s" % options.outMafSS)
def getScanTime(inHalPath, outDir, step):
    """Time one halBranchMutations run and return it in a one-element list.

    The HAL file scanned is inHalPath for a non-positive step, otherwise the
    "lod" HAL path from makePath for that step.  The genome scanned is the
    second entry returned by getHalGenomes(inHalPath); an AssertionError is
    raised if there are fewer than two.  The --refFile output goes to the
    "bed" path makePath builds for that genome name.
    """
    if step > 0:
        srcHalPath = makePath(inHalPath, outDir, step, "lod", "hal")
    else:
        srcHalPath = inHalPath

    # The genome list is always read from the original HAL, not the lod one.
    genomes = getHalGenomes(inHalPath)
    assert len(genomes) > 1
    genName = genomes[1]
    bedPath = makePath(inHalPath, outDir, step, genName, "bed")

    started = time.time()
    runShellCommand("halBranchMutations %s %s --refFile %s" %
                    (srcHalPath, genName, bedPath))
    return [time.time() - started]
def getScanTime(inHalPath, outDir, step):
    """Measure how long halBranchMutations takes on one genome.

    Returns:
        [seconds], a single-element list holding the wall-clock duration of
        the halBranchMutations command.

    The command is run against the step's "lod" HAL (from makePath) when
    step > 0 and against inHalPath otherwise, for the genome at index 1 of
    getHalGenomes(inHalPath).  Asserts that at least two genomes exist.
    """
    useLod = step > 0
    srcHalPath = inHalPath
    if useLod:
        srcHalPath = makePath(inHalPath, outDir, step, "lod", "hal")

    genomes = getHalGenomes(inHalPath)
    assert len(genomes) > 1
    genName = genomes[1]
    bedPath = makePath(inHalPath, outDir, step, genName, "bed")

    t1 = time.time()
    runShellCommand(
        "halBranchMutations %s %s --refFile %s" % (
            srcHalPath, genName, bedPath))
    t2 = time.time()
    return [t2 - t1]
def computeFit(options):
    """Run phyloFit and then remove options.outMafSS.

    The tree passed to phyloFit is options.tree when given, otherwise the
    value returned by getHalTree(options.hal).  --out-root is options.outMod
    with its extension removed, and " --error <value> " is appended when
    options.error is not None.
    """
    tree = options.tree if options.tree is not None else getHalTree(options.hal)

    outRoot = os.path.splitext(options.outMod)[0]
    cmd = "phyloFit --tree \"%s\" --subst-mod %s --sym-freqs %s --out-root %s" % (
        tree, options.substMod, options.outMafSS, outRoot)
    if options.error is not None:
        cmd = "%s --error %s " % (cmd, options.error)

    runShellCommand(cmd)
    runShellCommand("rm -f %s" % options.outMafSS)
def extractGeneMAFs(options):
    """Produce one MAF per BED file in options.bedFiles.

    For each BED file an intermediate "<stem>4d.bed" is created next to
    options.outMafPath, either by hal4dExtract or, when options.no4d is set,
    by a plain copy.  hal2mafMP.py then writes a MAF restricted to that BED,
    after which the intermediate BED is deleted if it exists.

    Afterwards every file matching options.outMafAllPaths is either removed
    (size under 5 bytes) or has its second line stripped by remove2ndLine.

    Raises:
        RuntimeError: if no file matches options.outMafAllPaths at the end.
    """
    runShellCommand("rm -f %s" % options.outMafAllPaths)

    for bedFile in options.bedFiles:
        mafStem = os.path.splitext(options.outMafPath)[0]
        bedStem = os.path.splitext(os.path.basename(bedFile))[0]
        bedFile4d = "%s_%s4d.bed" % (mafStem, bedStem)

        if options.no4d:
            runShellCommand("cp %s %s" % (bedFile, bedFile4d))
        else:
            runShellCommand("hal4dExtract %s %s %s %s" % (
                options.hal, options.refGenome, bedFile, bedFile4d))

        outMaf = "%s_%s.maf" % (
            os.path.splitext(options.outMafPath)[0],
            os.path.splitext(os.path.basename(bedFile4d))[0])

        flagParts = ["--noDupes",
                     "--targetGenomes %s" % options.halGenomes]
        if options.noAncestors is True:
            flagParts.append("--noAncestors")
        h2mFlags = " ".join(flagParts)

        runShellCommand(
            "hal2mafMP.py %s %s %s --numProc %d --refTargets %s --refGenome %s "
            % (options.hal, outMaf, h2mFlags, options.numProc,
               bedFile4d, options.refGenome))

        if os.path.exists(bedFile4d):
            os.remove(bedFile4d)

    for mafFile in glob.glob(options.outMafAllPaths):
        if os.path.getsize(mafFile) >= 5:
            remove2ndLine(mafFile)
        else:
            os.remove(mafFile)

    if not glob.glob(options.outMafAllPaths):
        raise RuntimeError("Given BED files do not overlap alignment")
def extractGeneMAFs(options):
    """Extract a MAF for each input BED file and tidy up the results.

    Starts by removing files matching options.outMafAllPaths.  Each BED file
    is turned into a "<stem>4d.bed" (via hal4dExtract, or "cp" when
    options.no4d is true) and exported with hal2mafMP.py; the temporary BED
    is then deleted.  Finally, matching MAFs smaller than 5 bytes are
    deleted and the rest go through remove2ndLine.

    Raises:
        RuntimeError: when nothing matches options.outMafAllPaths afterwards.
    """
    def stemOf(filePath):
        # File name without directory and without extension.
        return os.path.splitext(os.path.basename(filePath))[0]

    runShellCommand("rm -f %s" % options.outMafAllPaths)

    for bedFile in options.bedFiles:
        bedFile4d = (os.path.splitext(options.outMafPath)[0] + "_" +
                     stemOf(bedFile) + "4d.bed")
        if not options.no4d:
            extractCmd = "hal4dExtract %s %s %s %s" % (
                options.hal, options.refGenome, bedFile, bedFile4d)
        else:
            extractCmd = "cp %s %s" % (bedFile, bedFile4d)
        runShellCommand(extractCmd)

        outMaf = (os.path.splitext(options.outMafPath)[0] + "_" +
                  stemOf(bedFile4d) + ".maf")

        h2mFlags = "--noDupes --targetGenomes %s" % options.halGenomes
        if options.noAncestors is True:
            h2mFlags = h2mFlags + " --noAncestors"

        runShellCommand("hal2mafMP.py %s %s %s "
                        "--numProc %d --refTargets %s --refGenome %s "
                        % (options.hal, outMaf, h2mFlags, options.numProc,
                           bedFile4d, options.refGenome))

        if os.path.exists(bedFile4d):
            os.remove(bedFile4d)

    for mafFile in glob.glob(options.outMafAllPaths):
        tooSmall = os.path.getsize(mafFile) < 5
        if tooSmall:
            os.remove(mafFile)
            continue
        remove2ndLine(mafFile)

    remaining = glob.glob(options.outMafAllPaths)
    if len(remaining) < 1:
        raise RuntimeError("Given BED files do not overlap alignment")
def splitBed(path, options):
    """Split a BED file into chunks when it exceeds options.maxBedLines.

    Returns:
        A list of paths.  This is [path] when options.maxBedLines is None or
        the file (line count from "wc -l") is not longer than that limit;
        otherwise the list of chunk files written, in order.

    The first chunk is written to path with ".bed" replaced by "_0.bed";
    later chunks use "_<n>.bed" and are placed in the directory of
    options.outMafPath.

    NOTE(review): a new chunk is started only once the running counter is
    already greater than maxBedLines, so chunks hold slightly more than
    maxBedLines lines.  That behaviour is kept as-is here.
    NOTE(review): if path does not contain ".bed" the first chunk name equals
    path itself - confirm callers always pass *.bed files.
    """
    numLines = int(runShellCommand("wc -l %s" % path).split()[0])
    outDir = os.path.dirname(options.outMafPath)

    if options.maxBedLines is None or not numLines > options.maxBedLines:
        return [path]

    curLine = 0
    curBed = 0
    # The input handle used to be left open; the with-block closes it.
    with open(path, "r") as inBed:
        outPath = path.replace(".bed", "_%d.bed" % curLine)
        outPaths = [outPath]
        outBed = open(outPath, "w")
        try:
            for inLine in inBed:
                if curLine > options.maxBedLines:
                    # Roll over to the next chunk file.
                    curBed += 1
                    outBed.close()
                    outPath = path.replace(".bed", "_%d.bed" % curBed)
                    outPath = os.path.join(outDir, os.path.basename(outPath))
                    outPaths.append(outPath)
                    outBed = open(outPath, "w")
                    curLine = 0
                else:
                    curLine += 1
                outBed.write(inLine)
        finally:
            # Closing twice is harmless, so this is safe even if a rollover
            # failed right after the previous chunk was closed.
            outBed.close()
    return outPaths
def splitBed(path, options):
    """Break a large BED file into several smaller ones.

    Args:
        path: BED file to split.
        options: provides maxBedLines (None disables splitting) and
            outMafPath, whose directory receives every chunk after the
            first.

    Returns:
        [path] when no split is needed, else the chunk paths in write order.
        The first chunk sits beside the input as "<name>_0.bed"; subsequent
        chunks are "<name>_<n>.bed" inside dirname(options.outMafPath).

    NOTE(review): the rollover test runs before the counter is incremented,
    so each chunk ends up a little larger than maxBedLines; unchanged here.
    """
    numLines = int(runShellCommand("wc -l %s" % path).split()[0])
    outPaths = []
    outDir = os.path.dirname(options.outMafPath)

    needsSplit = (options.maxBedLines is not None and
                  numLines > options.maxBedLines)
    if not needsSplit:
        return [path]

    inBed = open(path, "r")
    outBed = None
    try:
        curLine = 0
        curBed = 0
        outPath = path.replace(".bed", "_%d.bed" % curLine)
        outPaths.append(outPath)
        outBed = open(outPath, "w")
        for inLine in inBed:
            if curLine > options.maxBedLines:
                curBed += 1
                outBed.close()
                chunkName = os.path.basename(
                    path.replace(".bed", "_%d.bed" % curBed))
                outPath = os.path.join(outDir, chunkName)
                outPaths.append(outPath)
                outBed = open(outPath, "w")
                curLine = 0
            else:
                curLine += 1
            outBed.write(inLine)
    finally:
        # Release both handles on every exit path; previously the input was
        # never closed and the output leaked if a write or open failed.
        if outBed is not None:
            outBed.close()
        inBed.close()
    return outPaths
def computeTreePhyloP(args):
    """Compute phyloP wiggle files for every genome reachable from args.root.

    Genomes are visited from a stack seeded with args.root; each visited
    genome's children (getHalChildrenNames) are pushed afterwards.  For a
    non-root genome, halAlignedExtract --complement first writes a BED file
    which is passed to halPhyloPMP.py as --refBed, and after scoring, the
    parent's wig file (if present) is lifted over with halWiggleLiftover
    --append.  When args.bigWig is True, a conversion command per wig file
    is queued and all of them are run at the end through
    runParallelShellCommands with args.numProc.
    """
    pending = [args.root]
    bigwigCmds = []

    while pending:
        genome = pending.pop()
        belowRoot = genome != args.root

        # BED of the regions of this genome that do not align to its parent.
        bedInsertsFile = outFileName(args, genome, "bed", "inserts", True)
        if belowRoot:
            runShellCommand(
                "halAlignedExtract %s %s --alignedFile %s --complement" % (
                    args.hal, genome, bedInsertsFile))
            bedFlags = "--refBed %s" % bedInsertsFile
        else:
            bedFlags = ""

        # Score with halPhyloP (restricted to the BED above when non-root).
        wigFile = outFileName(args, genome, "wig", "phyloP", False)
        phyloPCmd = "halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
            args.hal, genome, args.mod, bedFlags, args.numProc, wigFile)
        if args.subtree is not None:
            phyloPCmd = phyloPCmd + " --subtree %s" % args.subtree
        if args.prec is not None:
            phyloPCmd = phyloPCmd + " --prec %d" % args.prec
        runShellCommand(phyloPCmd)
        runShellCommand("rm -f %s" % bedInsertsFile)

        # Append the parent's scores, lifted down onto this genome.
        if belowRoot:
            parent = getHalParentName(args.hal, genome)
            parentWig = outFileName(args, parent, "wig", "phyloP", False)
            if os.path.isfile(parentWig):
                runShellCommand(
                    "halWiggleLiftover %s %s %s %s %s --append" % (
                        args.hal, parent, parentWig, genome, wigFile))

        # Queue the bigWig conversion; it also deletes the wig and sizes
        # files, so it is deferred until the whole tree has been visited.
        if args.bigWig is True and os.path.isfile(wigFile):
            sizesFile = outFileName(args, genome, "sizes", "chr", True)
            bwFile = outFileName(args, genome, "bw", "phyloP", False)
            bigwigCmds.append(
                "halStats %s --chromSizes %s > %s && "
                "wigToBigWig %s %s %s && "
                "rm -f %s &&"
                "rm -f %s" % (
                    args.hal, genome, sizesFile,
                    wigFile, sizesFile, bwFile,
                    wigFile,
                    sizesFile))

        pending.extend(getHalChildrenNames(args.hal, genome))

    # Run all queued bigWig conversions in parallel.
    runParallelShellCommands(bigwigCmds, args.numProc)
def simulateLoad(options):
    """Time one getBlockVizCmd command per target genome, with retries.

    Each command is attempted up to options.retry times.  Before every
    attempt the contents of options.udc are removed when options.udc is set
    and options.zapUdc is True.  A failed attempt is followed by a two-second
    sleep.  If every attempt fails, the last exception is re-raised.

    Returns:
        The sum of the successful timings returned by timeCmd.

    NOTE(review): when options.retry is below 1 no attempt is made and the
    timing variable is never assigned for the first command - confirm that
    callers always pass retry >= 1.
    """
    elapsedTime = 0.
    cmds = [getBlockVizCmd(options, tgtGenome)
            for tgtGenome in options.tgtGenomes]

    for cmd in cmds:
        lastExcep = None
        for attempt in xrange(options.retry):
            if options.udc is not None and options.zapUdc is True:
                runShellCommand("rm -rf %s" % os.path.join(options.udc, "*"))
            t = -1
            try:
                t = timeCmd(cmd)
            except Exception as e:
                lastExcep = e
                time.sleep(2)
            else:
                # Success: forget earlier failures and stop retrying.
                lastExcep = None
                break

        if lastExcep is not None:
            raise lastExcep
        elapsedTime += t

    return elapsedTime
def getHalBranchMutations(halPath, genomeName, args):
    """Build, print and run the halBranchMutations command for one genome.

    Output goes to "<genomeName>.bed" in args.outDir.  With sorting enabled
    (args.noSort false) the tool writes to stdout and the stream is piped
    through sortBed into that file; otherwise the tool writes the file
    directly.  --snpFile is added when args.doSnps is set, and --parentFile
    ("<genomeName>_pd.bed" in args.outDir) when args.doParentDeletions is
    set.
    """
    refBedFile = os.path.join(args.outDir, "%s.bed" % genomeName)
    sortOutput = not args.noSort
    dest = "stdout" if sortOutput else refBedFile

    command = "halBranchMutations %s %s --maxGap %s" % (
        halPath, genomeName, args.maxGap)
    command += " --refFile %s" % dest
    command += " --delBreakFile %s" % dest
    if args.doSnps:
        command += " --snpFile %s" % dest
    if args.doParentDeletions:
        command += " --parentFile %s" % os.path.join(
            args.outDir, "%s_pd.bed" % genomeName)
    if sortOutput:
        command += " | sortBed > %s" % refBedFile

    # Call form instead of the print statement: identical output for a
    # single string on Python 2, and valid syntax on Python 3 as well.
    print(command)
    runShellCommand(command)
def getHalBranchMutations(halPath, genomeName, args):
    """Run halBranchMutations for genomeName, echoing the command first.

    Args:
        halPath: HAL file passed to the tool.
        genomeName: genome to analyse; also names the output BED files.
        args: parsed options providing maxGap, outDir, noSort, doSnps and
            doParentDeletions.

    When args.noSort is false, the reference, deletion-breakpoint and
    (optional) SNP outputs are all sent to stdout and sorted by sortBed into
    "<outDir>/<genomeName>.bed"; when true they are written to that file
    directly.  Parent deletions, if requested, go to
    "<outDir>/<genomeName>_pd.bed".
    """
    command = "halBranchMutations %s %s --maxGap %s" % (
        halPath, genomeName, args.maxGap)
    refBedFile = os.path.join(args.outDir, "%s.bed" % genomeName)

    if args.noSort:
        dest = refBedFile
    else:
        dest = "stdout"

    extras = [" --refFile %s" % dest, " --delBreakFile %s" % dest]
    if args.doSnps:
        extras.append(" --snpFile %s" % dest)
    if args.doParentDeletions:
        parentBed = os.path.join(args.outDir, "%s_pd.bed" % genomeName)
        extras.append(" --parentFile %s" % parentBed)
    if not args.noSort:
        extras.append(" | sortBed > %s" % refBedFile)
    command += "".join(extras)

    # Parenthesised so this parses on Python 3 too; on Python 2 it prints
    # exactly what the former "print command" statement printed.
    print(command)
    runShellCommand(command)
def computeAgMAFStats(options):
    """Aggregate the per-gene MAFs into options.outMafSS with msa_view.

    msa_view is run with --aggregate options.halGenomes over the files
    matching options.outMafAllPaths.  Afterwards those files are removed,
    along with the same pattern with ".maf" replaced by ".maf-e".
    """
    mafPattern = options.outMafAllPaths
    aggregateCmd = "msa_view -o SS -z --in-format MAF --aggregate %s %s > %s" % (
        options.halGenomes, mafPattern, options.outMafSS)
    runShellCommand(aggregateCmd)

    for stalePattern in (mafPattern, mafPattern.replace(".maf", ".maf-e")):
        runShellCommand("rm -f %s" % stalePattern)
def modFreqs(options):
    """Rewrite options.outMod using the modFreqs tool.

    The four values passed to modFreqs are the first four entries returned
    by getHalBaseComposition(options.hal, options.refGenome, 1).  The model
    file is moved to "<outMod>_temp", modFreqs writes its output back to
    options.outMod, and the temporary file is then removed.
    """
    baseComp = getHalBaseComposition(options.hal, options.refGenome, 1)
    tempMod = "%s_temp" % options.outMod

    runShellCommand("mv %s %s" % (options.outMod, tempMod))
    runShellCommand("modFreqs %s %f %f %f %f > %s" % (
        tempMod,
        baseComp[0], baseComp[1], baseComp[2], baseComp[3],
        options.outMod))
    runShellCommand("rm -f %s" % tempMod)
def compMaf(inHalPath, outDir, step, overwrite, doMaf):
    """Compare the step-0 MAF against the MAF for the given step.

    Two comparisons are made, each with mafComparator followed by
    comparatorSummarizer.py: a plain one ("comp" outputs) and one with
    --near int(step) ("comp_near" outputs).  Each comparison runs only when
    doMaf is true and either overwrite is set or its XML output does not
    exist yet.  All paths come from makePath.
    """
    def shouldRun(xmlOut):
        return doMaf and (overwrite or not os.path.isfile(xmlOut))

    srcMaf = makePath(inHalPath, outDir, 0, "out", "maf")
    tgtMaf = makePath(inHalPath, outDir, step, "out", "maf")

    xmlPath = makePath(inHalPath, outDir, step, "comp", "xml")
    sumPath = makePath(inHalPath, outDir, step, "comp", "txt")
    if shouldRun(xmlPath):
        compareCmd = "mafComparator --maf1 %s --maf2 %s --out %s --samples 100000" % (
            srcMaf, tgtMaf, xmlPath)
        runShellCommand(compareCmd)
        runShellCommand("comparatorSummarizer.py --xml %s > %s " % (
            xmlPath, sumPath))

    xmlNearPath = makePath(inHalPath, outDir, step, "comp_near", "xml")
    sumNearPath = makePath(inHalPath, outDir, step, "comp_near", "txt")
    if shouldRun(xmlNearPath):
        nearCmd = "mafComparator --maf1 %s --maf2 %s --out %s --near %d --samples 100000" % (
            srcMaf, tgtMaf, xmlNearPath, int(step))
        runShellCommand(nearCmd)
        runShellCommand("comparatorSummarizer.py --xml %s > %s " % (
            xmlNearPath, sumNearPath))
def modFreqs(options):
    """Replace options.outMod with the output of the modFreqs tool.

    Steps, each a shell command:
      1. move options.outMod to "<outMod>_temp";
      2. run modFreqs on the temporary file with baseComp[0..3] from
         getHalBaseComposition(options.hal, options.refGenome, 1),
         redirecting output to options.outMod;
      3. delete the temporary file.
    """
    baseComp = getHalBaseComposition(options.hal, options.refGenome, 1)
    outMod = options.outMod

    steps = (
        "mv %s %s_temp" % (outMod, outMod),
        "modFreqs %s_temp %f %f %f %f > %s" % (
            outMod, baseComp[0], baseComp[1], baseComp[2], baseComp[3],
            outMod),
        "rm -f %s_temp" % outMod,
    )
    for shellCmd in steps:
        runShellCommand(shellCmd)
def computeAgMAFStats(options):
    """Run msa_view --aggregate over the extracted MAFs, then delete them.

    The aggregate list is options.halGenomes joined with commas.  Output is
    redirected to options.outMafSS.  Files matching options.outMafAllPaths,
    and the same pattern with ".maf" replaced by ".maf-e", are removed
    afterwards.
    """
    speciesList = ",".join(options.halGenomes)
    mafPattern = options.outMafAllPaths

    runShellCommand(
        "msa_view -o SS -z --in-format MAF --aggregate %s %s > %s" % (
            speciesList, mafPattern, options.outMafSS))
    runShellCommand("rm -f %s" % mafPattern)
    runShellCommand("rm -f %s" % mafPattern.replace(".maf", ".maf-e"))
def compMaf(inHalPath, outDir, step, overwrite, doMaf):
    """Run mafComparator (plain and --near) between step 0 and this step.

    For each of the two output sets ("comp" and "comp_near"), the comparison
    and its comparatorSummarizer.py summary are produced only if doMaf is
    true and the XML output is missing or overwrite is requested.  The
    --near value is int(step).
    """
    srcMaf = makePath(inHalPath, outDir, 0, "out", "maf")
    tgtMaf = makePath(inHalPath, outDir, step, "out", "maf")
    summarize = "comparatorSummarizer.py --xml %s > %s "

    # Plain comparison.
    xmlPath = makePath(inHalPath, outDir, step, "comp", "xml")
    sumPath = makePath(inHalPath, outDir, step, "comp", "txt")
    stale = overwrite or not os.path.isfile(xmlPath) if doMaf else False
    if stale:
        runShellCommand(
            "mafComparator --maf1 %s --maf2 %s --out %s --samples 100000"
            % (srcMaf, tgtMaf, xmlPath))
        runShellCommand(summarize % (xmlPath, sumPath))

    # Comparison with a --near window of int(step).
    xmlNearPath = makePath(inHalPath, outDir, step, "comp_near", "xml")
    sumNearPath = makePath(inHalPath, outDir, step, "comp_near", "txt")
    staleNear = overwrite or not os.path.isfile(xmlNearPath) if doMaf else False
    if staleNear:
        runShellCommand(
            "mafComparator --maf1 %s --maf2 %s --out %s --near %d --samples 100000"
            % (srcMaf, tgtMaf, xmlNearPath, int(step)))
        runShellCommand(summarize % (xmlNearPath, sumNearPath))
def computeAgMAFStats(options):
    """Aggregate the extracted MAFs with msa_view and remove the inputs.

    The --aggregate argument is the comma-joined options.targetGenomes when
    that is set, otherwise options.halGenomes as-is.  Output goes to
    options.outMafSS.  Files matching options.outMafAllPaths and its
    ".maf-e" counterpart are then deleted.
    """
    if options.targetGenomes is None:
        species = options.halGenomes
    else:
        species = ",".join(options.targetGenomes)

    mafPattern = options.outMafAllPaths
    runShellCommand("msa_view -o SS -z --in-format MAF --aggregate %s %s > %s"
                    % (species, mafPattern, options.outMafSS))
    runShellCommand("rm -f %s" % mafPattern)
    runShellCommand("rm -f %s" % mafPattern.replace(".maf", ".maf-e"))
def computeAgMAFStats(options):
    """Write aggregate statistics for the MAFs to options.outMafSS.

    Uses ",".join(options.targetGenomes) as the msa_view --aggregate list
    when target genomes were given, and options.halGenomes otherwise.  Once
    msa_view has run, the files matching options.outMafAllPaths are removed,
    followed by the same pattern with ".maf" replaced by ".maf-e".
    """
    useTargets = options.targetGenomes is not None
    species = ",".join(options.targetGenomes) if useTargets else options.halGenomes

    statsCmd = "msa_view -o SS -z --in-format MAF --aggregate %s %s > %s" % (
        species, options.outMafAllPaths, options.outMafSS)
    runShellCommand(statsCmd)

    leftovers = [options.outMafAllPaths,
                 options.outMafAllPaths.replace(".maf", ".maf-e")]
    for pattern in leftovers:
        runShellCommand("rm -f %s" % pattern)
def computeMAFStats(options):
    """Merge the extracted MAFs into one file and run msa_view on it.

    The first file matching options.outMafAllPaths is moved to
    options.outMafPath; every other match has its non-blank, non-"#" lines
    appended to it.  msa_view then writes to options.outMafSS, and both the
    matching files and the merged MAF are removed.

    NOTE(review): each appended line already ends with its own newline and
    gets another "\n" added - confirm the doubled newline is intended.
    """
    # Make one big MAF.
    for index, mafFile in enumerate(glob.glob(options.outMafAllPaths)):
        if index == 0:
            runShellCommand("mv %s %s" % (mafFile, options.outMafPath))
            continue
        with open(options.outMafPath, "a") as outMaf:
            with open(mafFile, "r") as inMaf:
                for line in inMaf:
                    stripped = line.lstrip()
                    if stripped and not stripped.startswith("#"):
                        outMaf.write(line + "\n")

    runShellCommand("msa_view -o SS --in-format MAF %s > %s" % (
        options.outMafPath, options.outMafSS))
    runShellCommand("rm -f %s %s" % (options.outMafAllPaths,
                                     options.outMafPath))
def computeMAFStats(options):
    """Concatenate the per-gene MAFs and compute statistics with msa_view.

    Of the files matching options.outMafAllPaths, the first becomes
    options.outMafPath (via "mv") and the remaining ones are appended to it,
    skipping lines that are blank or start with "#" after leading whitespace
    is stripped.  msa_view output goes to options.outMafSS; the inputs and
    the merged file are deleted afterwards.

    NOTE(review): "\n" is added to lines that still carry their original
    line ending - verify this is deliberate.
    """
    mafFiles = glob.glob(options.outMafAllPaths)

    # The first match seeds the combined MAF.
    if mafFiles:
        runShellCommand("mv %s %s" % (mafFiles[0], options.outMafPath))

    for mafFile in mafFiles[1:]:
        with open(options.outMafPath, "a") as outMaf:
            with open(mafFile, "r") as inMaf:
                for line in inMaf:
                    body = line.lstrip()
                    if len(body) == 0 or body[0] == "#":
                        continue
                    outMaf.write(line + "\n")

    runShellCommand("msa_view -o SS --in-format MAF %s > %s" %
                    (options.outMafPath, options.outMafSS))
    runShellCommand("rm -f %s %s" %
                    (options.outMafAllPaths, options.outMafPath))
def remove2ndLine(path):
    """Delete the second line of the file at path, in place, using sed."""
    sedCmd = "sed -i -e 2d %s" % path
    runShellCommand(sedCmd)
def remove2ndLine(path):
    """Delete the second line of the file at path, in place.

    Files with fewer than two lines are left with the same content.  The
    file is processed as bytes, so line endings and encoding are untouched.

    This replaces the former "sed -i -e 2d <path>" shell call.  That form is
    not portable (BSD sed takes the "-e" as a backup suffix and leaves a
    "<path>-e" copy behind) and the unquoted path broke on names containing
    spaces or shell metacharacters.

    Raises:
        IOError/OSError: if path cannot be read or replaced.
    """
    import os
    import shutil
    import tempfile

    targetDir = os.path.dirname(os.path.abspath(path))
    # Write next to the target so the final rename stays on one filesystem.
    fd, tmpPath = tempfile.mkstemp(dir=targetDir,
                                   prefix=os.path.basename(path) + ".")
    try:
        with os.fdopen(fd, "wb") as dst:
            with open(path, "rb") as src:
                for lineNo, line in enumerate(src, 1):
                    if lineNo != 2:
                        dst.write(line)
        # mkstemp creates the file owner-only; keep the original mode bits.
        shutil.copymode(path, tmpPath)
        os.rename(tmpPath, path)
        tmpPath = None
    finally:
        # Only reached with a path when something failed before the rename.
        if tmpPath is not None and os.path.exists(tmpPath):
            os.remove(tmpPath)