def main():
    """Filter paired-end fastq files, launching one subprocess per table row.

    Inputs (command line):
        -- input_table: txt table file containing tab-delim file path inputs
           for files.filter_PEfastQs(), one job per line:
               PE_FastqPathFwd<tab>PE_FastqPathRev<tab>OutputComboBaseName<newLine>
        -- filter_func: string representing a lambda func to act as filter
           for fastqRecs
        -- out_dir: directory that receives the filtered files (created if
           needed)

    Outputs:
        -- For every row, four output paths under out_dir are handed to
           filter_PEfastQs(): two "*.filtered.mated.fastq" (named after the
           fwd/rev inputs) plus "*.filtered.singled.fastq" and
           "*.filtered.failed.fastq" (named after OutputComboBaseName).

    Returns:
        List of the started mp.Process objects.  They are NOT joined here.

    Raises:
        InvalidFileFormatError: a non-blank table line does not have exactly
            three tab-separated columns.
    """
    desc = """This script filters paired fastq files based on a provided lambda filter logic. Input and output paths are determined by the input table file."""

    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('input_table', type=str,
                        help="""Path to input table file.""")
    parser.add_argument('filter_func', type=str,
                        help="""lambda filter function (quoted string).""")
    parser.add_argument('out_dir', type=str,
                        help="""Path to out directory.""")
    args = parser.parse_args()

    # print the called command:
    sys.stderr.write("%s\n" % (" ".join(sys.argv)))

    # open and parse the input table file; the handle is closed as soon as the
    # rows are in memory, and blank lines (e.g. a trailing newline) are ignored
    # instead of being reported as malformed rows
    with open(args.input_table, 'rU') as table:
        inputs = [x.strip('\n').split('\t') for x in table if x.strip()]

    # validate every row before any subprocess is started
    for i in inputs:
        if len(i) != 3:
            raise InvalidFileFormatError("At least one line in %s does not have exactly three columns:\n%s\n" % (args.input_table, i))

    # create out_dir if needed
    mkdirp(args.out_dir)

    # NOTE(review): this eval()s a string taken from the command line.  Setting
    # __builtins__ to None is not a real sandbox; only run with trusted input.
    filtFunc = eval(args.filter_func, {"__builtins__": None})

    outDir = args.out_dir.rstrip('/')

    def outPath(source, suffix):
        # <out_dir>/<basename of source up to the first '.fastq'>.<suffix>
        return "%s/%s.%s" % (outDir, source.split('/')[-1].split('.fastq')[0], suffix)

    # set up and unleash the subprocesses
    jobs = []
    for i in inputs:
        arguments = (
            filtFunc,
            i[0],
            i[1],
            outPath(i[0], "filtered.mated.fastq"),    # matchedPassPath1
            outPath(i[1], "filtered.mated.fastq"),    # matchedPassPath2
            outPath(i[2], "filtered.singled.fastq"),  # singlePassPath
            outPath(i[2], "filtered.failed.fastq"),   # nonPassPath
        )
        p = mp.Process(target=filter_PEfastQs, args=arguments)
        jobs.append(p)
        p.start()

    return jobs
def bowtie_index(reference_in,ebwt_outfile_base,runDir,options=None):
    """Create bowtie indexes from new fasta set.

    reference_in      : passed to bowtie-build as <reference_in>
    ebwt_outfile_base : passed to bowtie-build as <ebwt_outfile_base>
    options           : quoted string representing valid cmd line bowtie-build
                        options (may be None / empty for no extra options)
    runDir            : path to dir to place stdErr/stdOut logs
                        - all steps of pipeline scripts should share same runDir

    Returns whatever runExternalApp() returns; element [0] is reported on
    stdout and element [1] on stderr when non-empty.

    ----------
    bowtie-build help text:

    Usage: bowtie-build [options]* <reference_in> <ebwt_outfile_base>
        reference_in            comma-separated list of files with ref sequences
        ebwt_outfile_base       write Ebwt data to files with this dir/basename
    Options:
        -f                      reference files are Fasta (default)
        -c                      reference sequences given on cmd line (as <seq_in>)
        -C/--color              build a colorspace index
        -a/--noauto             disable automatic -p/--bmax/--dcv memory-fitting
        -p/--packed             use packed strings internally; slower, uses less mem
        -B                      build both letter- and colorspace indexes
        --bmax <int>            max bucket sz for blockwise suffix-array builder
        --bmaxdivn <int>        max bucket sz as divisor of ref len (default: 4)
        --dcv <int>             diff-cover period for blockwise (default: 1024)
        --nodc                  disable diff-cover (algorithm becomes quadratic)
        -r/--noref              don't build .3/.4.ebwt (packed reference) portion
        -3/--justref            just build .3/.4.ebwt (packed reference) portion
        -o/--offrate <int>      SA is sampled every 2^offRate BWT chars (default: 5)
        -t/--ftabchars <int>    # of chars consumed in initial lookup (default: 10)
        --ntoa                  convert Ns in reference to As
        --seed <int>            seed for random number generator
        -q/--quiet              verbose output (for debugging)
        -h/--help               print detailed description of tool and its options
        --usage                 print this usage message
        --version               print version information and quit
    """
    # make runDir if it does not yet exist
    print("creating: %s" % (runDir))
    mkdirp(runDir)

    # Construct cmdArgs.  The no-options branch is required: options defaults
    # to None, and without it cmdArgs would be unbound below.
    if options:
        cmdArgs = "%s %s %s" % (options,reference_in,ebwt_outfile_base)
    else:
        cmdArgs = "%s %s" % (reference_in,ebwt_outfile_base)

    # Run bowtie-build and capture output
    btBuildResults = runExternalApp('bowtie-build',cmdArgs)

    # Report bowtie-build stdout and stderr
    if btBuildResults[0]:
        print('[%s] %s' % (whoami(),btBuildResults[0]))
    if btBuildResults[1]:
        # sys.stderr is a file object, not a callable: use .write()
        sys.stderr.write('[%s] %s\n' % (whoami(),btBuildResults[1]))

    return btBuildResults
def init_dir_structure(spcName,versionID,baseDir,isCurrent=False):
    """Ensure that the correct directory structure exists to accept the genome
    and index data files.  Create anything that does not already exist.

    Directories ensured (via mkdirp):
        <baseDir>/genomes/<spcName>/<versionID>/fasta/
        <baseDir>/genomes/<spcName>/<versionID>/annotations/
        <baseDir>/genomes/<spcName>/<versionID>/mysql/
        <baseDir>/indexes/

    If isCurrent is true: soft link the versionID dir as
    <baseDir>/genomes/<spcName>/current.  An existing "current" symlink is
    replaced so the function can be re-run; anything else already sitting at
    that path still makes os.symlink raise OSError.

    Example args:
        spcName   = 'aedes_aegypti'
        versionID = 'release_7'
        baseDir   = '/home/data'
    """
    spcVerDir = '%s/genomes/%s/%s' % (baseDir,spcName,versionID)

    neededDirs = (
        '%s/fasta/' % (spcVerDir),
        '%s/annotations/' % (spcVerDir),
        '%s/mysql/' % (spcVerDir),
        # NOTE(review): indexes live directly under baseDir, not under the
        # species/version dir -- confirm this is intended.
        '%s/indexes/' % (baseDir),
    )
    for dirPath in neededDirs:
        print("creating dir: %s" % (dirPath))
        mkdirp(dirPath)

    if isCurrent:
        currentLink = '%s/genomes/%s/current' % (baseDir,spcName)
        print("creating sym link from %s to 'current' because <isCurrent> is set to True" % (spcVerDir))
        # Re-pointing "current" at a new release must not fail with EEXIST.
        if os.path.islink(currentLink):
            os.remove(currentLink)
        os.symlink(spcVerDir, currentLink)
def extractFromDoubleSidedBedtoolOut(filePath, cols, side="right", outDir="."):
    """Create a new file from filePath keeping only the bed info from the
    left or right half of a BEDtools outFile with double-sided output.

    filePath : path to the double-sided BEDtools output file.
    cols     : two-item sequence; cols[0] is the number of columns in the left
               half of each line and cols[1] the number in the right half
               (e.g. [3, 6] or [6, 6]).
    side     : 'right' or 'left' -- which half of each line to keep.
    outDir   : directory for the result (created if needed).

    Lines starting with "track" are skipped.  The result is written to
    <outDir>/<input basename without ".bed">_<side>.bed and then passed
    through `sort -u` to sort it and drop redundant lines.

    Returns the path of the new file.

    Raises:
        InvalidOptionError: side is not 'right' or 'left'.
        InvalidFileFormatError: a line does not split into exactly
            cols[0] + cols[1] tab-separated columns.
    """
    # Reject a bad 'side' before touching the filesystem.
    validSides = ["right", "left"]
    if side not in validSides:
        raise InvalidOptionError('option "side" must be one of %s. Was: %s.' % (validSides, side))
    keep = 1 if side == "right" else 0

    # Prepare outDir if it doesnt already exist
    mkdirp(outDir)

    outFilePath = "%s/%s_%s.bed" % (outDir, filePath.replace(".bed", "").split("/")[-1], side)

    with open(filePath, "rU") as inFile:
        with open(outFilePath, "w") as outFile:
            for lineNum, line in enumerate(inFile, 1):
                if line.startswith("track"):
                    continue
                fields = line.strip("\n").split("\t")

                # Divide the line into two based on cols.  A plain
                # fields[cols[0]:] is used for the right half: the former
                # negative-index slice returned the WHOLE line when a row had
                # exactly cols[0] fields, letting one-sided rows slip through
                # the length check whenever cols[0] == cols[1].
                divLine = fields[: cols[0]], fields[cols[0] :]

                # Ensure the length of each half is what we expect
                if not ((len(divLine[0]) == cols[0]) and (len(divLine[1]) == cols[1])):
                    raise InvalidFileFormatError(
                        'line %s in file %s has unexpected number of columns or the values in "cols" is incorrect.'
                        % (lineNum, filePath)
                    )
                outFile.write("%s\n" % ("\t".join(divLine[keep])))

    # Sort and remove redundancy from lines in new file
    runExternalApp("sort", "-u %s > %s.tmp" % (outFilePath, outFilePath))
    runExternalApp("mv", "%s.tmp %s" % (outFilePath, outFilePath))

    return outFilePath
def runSCOPE(pLen,genes,jobName,scopeDir,outDir,paramName,jMem='2000',verbose=False):
    """Perform a SCOPE run and return the result of runExternalApp('java', ...).

    Notes:
        pLen      = promoter length to use (also part of the output file names).
        genes     = 'gene;gene;gene;etc'
        jobName   = job label; output files are <outDir>/<jobName>.<pLen>.*
        scopeDir  = full path to the SCOPE install; java is launched from here.
        outDir    = output directory; a relative path is resolved against the
                    working directory at call time.  Created if needed.
        paramName = value passed to SCOPE's -pf option.
        jMem      = java max heap in MB (-Xmx<jMem>m).
        verbose   = if true, element [0] and [1] of the result are written to
                    <outDir>/<jobName>.<pLen>.out and .err respectively.

    The working directory is changed to scopeDir only for the duration of the
    java call and is restored afterwards, so repeated calls with a relative
    outDir keep resolving against the caller's directory.

    NOTE(review): error handling for a failed SCOPE run is not done here; it
    is whatever runExternalApp() provides.
    """
    # Get full path (if not given) for outDir since we will be jumping around
    # in the directory tree
    if not outDir.startswith('/'):
        outDir = os.getcwd()+'/'+outDir
    outDir = outDir.rstrip('/')

    # Set up argString
    outPathBase = '%s/%s.%s' % (outDir,jobName,pLen)
    argString = '''-Xmx%sm -cp dist/scope.jar edu.dartmouth.bglab.beam.CGIScope -pf "%s" -ofx "%s.xml" -oft "%s.txt" -oje "%s" -qg "%s" -sgl "%s" -drb "true" -dra "true" -drbp "true"''' \
              % (jMem, paramName, outPathBase, outPathBase, jobName,genes,pLen)

    mkdirp(outDir) # make outDir along with parent dirs as needed

    # The classpath above (dist/scope.jar) is relative, so java has to be
    # started from inside scopeDir.  Always return to where we came from.
    startDir = os.getcwd()
    os.chdir(scopeDir)
    try:
        print('starting run...')
        resultSCOPE = runExternalApp('java',argString)
    finally:
        os.chdir(startDir)

    # write stdOut/Err to files if requested
    if verbose:
        with open(outPathBase+'.out','w') as stdOutFile:
            stdOutFile.write(resultSCOPE[0])
        with open(outPathBase+'.err','w') as stdErrFile:
            stdErrFile.write(resultSCOPE[1])

    return resultSCOPE
def plotScatter(pearsonStats,normedTxCntsList,opts):
    """Scatter-plot two count vectors against each other, overlay a linear
    best-fit line, and save the figure as a PNG in opts.dir.

    pearsonStats     : sequence; items [0] and [1] are printed in the text box
                       as 'Pearson: <[0]>, <[1]>' (presumably r and its
                       p-value -- confirm against the caller).
    normedTxCntsList : sequence whose items [0] and [1] are the x and y values.
    opts             : options object providing:
                         log    -- use log-scaled axes, and name the file
                                   '<a>_vs_<b>.log.png' instead of
                                   '<a>_vs_<b>.png'
                         name_a -- x-axis label / first part of file name
                         name_b -- y-axis label / second part of file name
                         dir    -- output directory (created if needed)
                         show   -- call pl.show() after saving

    The figure is always written INSIDE opts.dir, whether or not opts.dir
    ends with a path separator.
    """
    import os  # function-scope import: this block did not previously need os

    xVals = normedTxCntsList[0]
    yVals = normedTxCntsList[1]

    fig = pl.figure()
    ax = fig.add_subplot(111)
    if opts.log:
        ax.set_xscale('log')
        ax.set_yscale('log')

    ax.scatter(xVals,yVals, s=15, c='b', marker='o', alpha=1)
    if not opts.log:
        ax.set_autoscale_on(False)
    ax.set_xlabel(opts.name_a)
    ax.set_ylabel(opts.name_b)

    # First-degree fit, drawn from x=1 to the largest x value.
    m,b = pl.polyfit(xVals,yVals,1)
    fitXs = [1,max(xVals)]
    bfYs = pl.polyval([m,b], fitXs)
    ax.plot(fitXs,bfYs,'r-')

    pl.text(0.01,0.99,'Pearson: %.4f, %s\nBest Fit: y=%.3f*x+%.3f' % (pearsonStats[0],pearsonStats[1],m,b),
            bbox=dict(facecolor='#87AACD', alpha=1),
            horizontalalignment='left',
            verticalalignment='top',
            transform = ax.transAxes)

    mkdirp(opts.dir)
    if not opts.log:
        figName = '%s_vs_%s.png' % (opts.name_a,opts.name_b)
    else:
        figName = '%s_vs_%s.log.png' % (opts.name_a,opts.name_b)
    # os.path.join supplies the separator that plain string concatenation
    # dropped when opts.dir had no trailing '/'.
    pl.savefig(os.path.join(opts.dir,figName))

    print('Show? %s' % (opts.show))
    if opts.show:
        pl.show()
def main():
    """Command-line driver: test motifs for enrichment in a foreground gene set.

    Steps, as performed below:
      1. Parse and validate the options.
      2. Load promoter sequences (getSeqs) and, with --check-seqs, print their
         stats and exit.
      3. Build the motif hit dictionary, either by searching the promoters
         here (default) or by parsing an existing possumSearch output file
         (--from-possum).
      4. Run motifHyprGeoEnrichment on the real foreground set and on
         --plot-fdrs random foreground sets.
      5. Write the data table and one histogram per motif.

    Raises:
        InvalidOptionError: a required option is missing.
    """
    #+++++++++++ File Parseing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""

    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option('--motifs', type='str',default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str',default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--thresh', type='float',default=0.005,
                      help="""P-value threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str',default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int',default=1000,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string',default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string',default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string',default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int',default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    parser.add_option('--from-possum',default=False,
                      help="""Path to possumSearch outFile, skips motif finding step. (default=%default)""")
    parser.add_option('--verbose',action='store_true',default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs',action='store_true',default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")
    parser.add_option('--expect',action='store_true',default=False,
                      help="""Use median seqLen to set success threshold at greater than the estimated expected number of occurences in each promoter [pValThresh*searchesPerSeq]. (default=%default).""")

    (opts, args) = parser.parse_args()

    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    # The string default is a placeholder so --help can show it; swap in the
    # actual timestamp now.
    if opts.job == 'int(time())':
        opts.job = int(time())
    if not opts.from_possum:
        # Every one of these must be present.  (The former test,
        # "not opts.motifs or opts.motif_type or ...", was true whenever ANY
        # of the later options was set -- i.e. always, given their defaults --
        # so this mode could never run.)
        required = (('--motifs',opts.motifs),
                    ('--motif-type',opts.motif_type),
                    ('--thresh',opts.thresh),
                    ('--promoters',opts.promoters),
                    ('--plen',opts.plen))
        missing = [flag for flag,value in required if value is None or value == '']
        if missing:
            raise InvalidOptionError("""Unless --from-possum, the following argument are ALL required: --motifs --motif-type --thresh --promoters --plen (missing: %s)""" % (' '.join(missing)))
    else:
        if not opts.motifs:
            raise InvalidOptionError("When using --from-possum, --motifs should be the PSSM file used to generate this particular hit set.")

    def report(msg):
        # progress messages go to stdout only when --verbose was given
        if opts.verbose:
            sys.stdout.write(msg)

    # +++++ Lets Begin +++++
    report('\n%s\n\n' % (' '.join(sys.argv)))

    report('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out,opts.job)
    mkdirp(opts.out)

    report('building seqDict...\n')
    seqDict = getSeqs(opts.promoters,opts.plen)
    if opts.check_seqs:
        seqStats(seqDict,show=True)
        sys.exit(0)
    else:
        seqData = seqStats(seqDict,show=False)

    # --expect turns the flag into a numeric threshold: median seq length
    # * p-value threshold * 2; otherwise the threshold is 0.
    if opts.expect:
        opts.expect = seqData['medLen']*opts.thresh*2
    else:
        opts.expect = 0

    # --- am i doing the searching myself? ---
    if not opts.from_possum:
        # -- yes --
        report('building motifList...\n')
        motifList = getMotifs(opts.motifs,opts.motif_type)

        report('getting nucFreqs...\n')
        halfAT,halfGC = getSeqFreqs(seqDict)

        report('building hitDict...\n')
        motifHits = getEvalHitDict(motifList,seqDict,pThresh=opts.thresh,halfAT=halfAT,halfGC=halfGC)
    else:
        # -- no: the hits already exist, they only need to be parsed --
        report('skipping to building hitDict step...\n')
        pACs = getPossumProfileACs(opts.motifs)
        possumTable = getPossumHitTable(opts.from_possum,headers=possumHeaders)
        motifHits = getPossumHitDict(possumTable,seqDict.keys(),pACs)
        motifList = makeMotifListFromPossum(pACs) # create list of DummyPlug MotifObjs for compatibility

    report('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)

    outData = {}
    report('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichment(motifList,motifHits,foregroundSeqs,opts.expect)
    appendData(realData,outData)

    # One control run per requested random foreground set.
    for i in range(opts.fdrs):
        report('ctrl_%s:\n' % (i))
        report('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs,seqDict)
        report('\tcalculating p-values...\n')
        ctrlData = motifHyprGeoEnrichment(motifList,motifHits,rForground,opts.expect)
        appendData(ctrlData,outData)

    report('writting outData...\n')
    writeDataTable(outBaseStr,outData,motifList)

    report('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr,outData,m.id)
def bowtie_align(ebwt,readsString,hit,runDir,options=None):
    """Run alignment of fastQ to bowtie index.

    ebwt        : passed to bowtie as <ebwt>.
    readsString : appropriate quoted string representing which fastq files to
                  use (see bowtie -h).
    hit         : passed to bowtie as <hit>.
    runDir      : path to dir to place stdErr/stdOut logs
                  - all steps of pipeline scripts should share same runDir
    options     : quoted string representing valid cmd line bowtie options
                  (may be None / empty for no extra options)

    Returns whatever runExternalApp() returns; element [0] is echoed line by
    line on stdout and element [1] line by line on stderr, each line prefixed
    with '[<whoami()>] '.

    ----------
    bowtie help text:

    Usage:
    bowtie [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | <s>} [<hit>]

      <m1>    Comma-separated list of files containing upstream mates (or the
              sequences themselves, if -c is set) paired with mates in <m2>
      <m2>    Comma-separated list of files containing downstream mates (or the
              sequences themselves if -c is set) paired with mates in <m1>
      <r>     Comma-separated list of files containing Crossbow-style reads. Can be
              a mixture of paired and unpaired. Specify "-" for stdin.
      <s>     Comma-separated list of files containing unpaired reads, or the
              sequences themselves, if -c is set. Specify "-" for stdin.
      <hit>   File to write hits to (default: stdout)
    Input:
      -q                 query input files are FASTQ .fq/.fastq (default)
      -f                 query input files are (multi-)FASTA .fa/.mfa
      -r                 query input files are raw one-sequence-per-line
      -c                 query sequences given on cmd line (as <mates>, <singles>)
      -C                 reads and index are in colorspace
      -Q/--quals <file>  QV file(s) corresponding to CSFASTA inputs; use with -f -C
      --Q1/--Q2 <file>   same as -Q, but for mate files 1 and 2 respectively
      -s/--skip <int>    skip the first <int> reads/pairs in the input
      -u/--qupto <int>   stop after first <int> reads/pairs (excl. skipped reads)
      -5/--trim5 <int>   trim <int> bases from 5' (left) end of reads
      -3/--trim3 <int>   trim <int> bases from 3' (right) end of reads
      --phred33-quals    input quals are Phred+33 (default)
      --phred64-quals    input quals are Phred+64 (same as --solexa1.3-quals)
      --solexa-quals     input quals are from GA Pipeline ver. < 1.3
      --solexa1.3-quals  input quals are from GA Pipeline ver. >= 1.3
      --integer-quals    qualities are given as space-separated integers (not ASCII)
    Alignment:
      -v <int>           report end-to-end hits w/ <=v mismatches; ignore qualities
        or
      -n/--seedmms <int> max mismatches in seed (can be 0-3, default: -n 2)
      -e/--maqerr <int>  max sum of mismatch quals across alignment for -n (def: 70)
      -l/--seedlen <int> seed length for -n (default: 28)
      --nomaqround       disable Maq-like quality rounding for -n (nearest 10 <= 30)
      -I/--minins <int>  minimum insert size for paired-end alignment (default: 0)
      -X/--maxins <int>  maximum insert size for paired-end alignment (default: 250)
      --fr/--rf/--ff     -1, -2 mates align fw/rev, rev/fw, fw/fw (default: --fr)
      --nofw/--norc      do not align to forward/reverse-complement reference strand
      --maxbts <int>     max # backtracks for -n 2/3 (default: 125, 800 for --best)
      --pairtries <int>  max # attempts to find mate for anchor hit (default: 100)
      -y/--tryhard       try hard to find valid alignments, at the expense of speed
      --chunkmbs <int>   max megabytes of RAM for best-first search frames (def: 64)
    Reporting:
      -k <int>           report up to <int> good alignments per read (default: 1)
      -a/--all           report all alignments per read (much slower than low -k)
      -m <int>           suppress all alignments if > <int> exist (def: no limit)
      -M <int>           like -m, but reports 1 random hit (MAPQ=0); requires --best
      --best             hits guaranteed best stratum; ties broken by quality
      --strata           hits in sub-optimal strata aren't reported (requires --best)
    Output:
      -t/--time          print wall-clock time taken by search phases
      -B/--offbase <int> leftmost ref offset = <int> in bowtie output (default: 0)
      --quiet            print nothing but the alignments
      --refout           write alignments to files refXXXXX.map, 1 map per reference
      --refidx           refer to ref. seqs by 0-based index rather than name
      --al <fname>       write aligned reads/pairs to file(s) <fname>
      --un <fname>       write unaligned reads/pairs to file(s) <fname>
      --max <fname>      write reads/pairs over -m limit to file(s) <fname>
      --suppress <cols>  suppresses given columns (comma-delim'ed) in default output
      --fullref          write entire ref name (default: only up to 1st space)
    Colorspace:
      --snpphred <int>   Phred penalty for SNP when decoding colorspace (def: 30)
         or
      --snpfrac <dec>    approx. fraction of SNP bases (e.g. 0.001); sets --snpphred
      --col-cseq         print aligned colorspace seqs as colors, not decoded bases
      --col-cqual        print original colorspace quals, not decoded quals
      --col-keepends     keep nucleotides at extreme ends of decoded alignment
    SAM:
      -S/--sam           write hits in SAM format
      --mapq <int>       default mapping quality (MAPQ) to print for SAM alignments
      --sam-nohead       supppress header lines (starting with @) for SAM output
      --sam-nosq         supppress @SQ header lines for SAM output
      --sam-RG <text>    add <text> (usually "lab=value") to @RG line of SAM header
    Performance:
      -o/--offrate <int> override offrate of index; must be >= index's offrate
      -p/--threads <int> number of alignment threads to launch (default: 1)
      --mm               use memory-mapped I/O for index; many 'bowtie's can share
      --shmem            use shared mem for index; many 'bowtie's can share
    Other:
      --seed <int>       seed for random number generator
      --verbose          verbose output (for debugging)
      --version          print version information and quit
      -h/--help          print this usage message
    """
    # make runDir if it does not yet exist
    mkdirp(runDir)

    # Construct cmdArgs
    if options:
        cmdArgs = "%s %s %s %s" % (options,ebwt,readsString,hit)
    else:
        cmdArgs = "%s %s %s" % (ebwt,readsString,hit)

    # Run and capture output
    print("Setting up bowtie call with the following cmd:\n\t\tbowtie %s" % (cmdArgs))
    btResults = runExternalApp('bowtie',cmdArgs)

    # Report stdout and stderr
    if btResults[0]:
        for line in btResults[0].split('\n'):
            print('[%s] %s' % (whoami(),line))
    if btResults[1]:
        for line in btResults[1].split('\n'):
            # split() removed the newlines; put one back per line so the
            # stderr report is not one run-together line
            sys.stderr.write('[%s] %s\n' % (whoami(),line))

    return btResults
help="""Directory path for output (default=%default).""") parser.add_option('--track', dest="track", type='string',default='untitled', help="""Unbroken string to use for track name (default=%default).""") parser.add_option('--description', dest="description", type='string',default='none given', help="""Quoted string for description of track (default=%default).""") parser.add_option('--rgb', dest="rgb", type='string',default='0,0,0', help="""Unbroken comma separated string to denote color of track (default=%default).""") (opts, args) = parser.parse_args() if len(sys.argv) == 1: parser.print_help() exit(0) if opts.out.endswith('/'): opts.out = opts.out[:-1] if not len(opts.rgb.split(',')): raise InvalidOptionError('Malformed --rgb option: %s.' % (opts.rgb)) # Prepare outdir mkdirp(opts.out) outFile = open('%s/%s' % (opts.out,args[0].split('/')[-1].replace('.txt', '.bed')), 'w') outFile.write('track name=%s description="%s" useScore=0\n' % (opts.track,opts.description)) # For every line starting "cigar:" convert to bed and write out for line in open(args[0],'rU'): if line.startswith('cigar:'): bedLine = exonerateCigar2BEDline(line,opts.rgb) outFile.write(bedLine) outFile.close()
def main():
    """
    1: Collect Tx from one or more species that are within at least some r value
       of similarity to a provided example Tx or a submitted hypothetical
       expression vector.
    2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking
       sequences into a new FASTA for use in motif discovery.

    Files written to --out-dir (mode 0775):
        sortedTxList.tsv -- Tx passing the r / p-value filters, highest r first
        txBed.bed        -- result of convert_2_bed() for those Tx
        flankBed.bed     -- result of get_fastas(), saved with .saveas()
        flankFasta.fas   -- the sequence file referenced by flankBed.seqfn
        megaFasta.fas    -- only with --dump-megafasta

    With --dump-stats the filtered Tx table is printed to stdout and the
    program exits before any file is created.
    """
    import operator  # function-scope import: used only to build the r filter

    desc = """(1) Collect Tx from one or more species that are within at least some r value of similarity to a provided example Tx or a submitted hypothetical expression vector. (2) Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA for use in motif discovery."""

    parser = argparse.ArgumentParser(description=desc)
    logger = logging.getLogger(sys.argv[0].split('/')[-1])

    parser.add_argument('--expn-path', type=str, required=True,
                        help="""Path to expression table file. \n(default: %(default)s)""")
    parser.add_argument('--tx-name', type=str, required=True,
                        help="""Name of the Tx you want to use as a model. (default: %(default)s)""")
    parser.add_argument('--pearson-filter-type', type=str, default='>=', choices=['>=','<='],
                        help="""Use >= to find similar expn profiles or <= to find opposite profiles. (default: %(default)s)""")
    parser.add_argument('--pearson-filter-thresh', type=float, default=0.7,
                        help="""Set the threshold of the Pearson r value for the filter. (default: %(default)s)""")
    parser.add_argument('--pval-filter-thresh', type=float, default=0.05,
                        help="""Set the upper threshold for the p-value of the Pearson r values to keep. (default: %(default)s)""")
    parser.add_argument('--tx-name-header', type=str, required=True,
                        help="""The text of the header in the expn table where tx names are stored. (default: %(default)s)""")
    parser.add_argument('--cond-headers', type=str, required=True, nargs='+',
                        help="""A list of the text of the headers in the expn table where the values for each condition are stored (--cond-headers cond1 cond2 ...). (default: %(default)s)""")
    # NOTE(review): the help text promises "a list" but nargs='?' accepts at
    # most one value -- confirm what mangle_expn_vectors() expects before
    # changing it.
    parser.add_argument('--manual-headers', type=str, required=False, nargs='?',
                        help="""If the expn table does not have headers, provide a list of ordered names for them here. (default: %(default)s)""")
    parser.add_argument('--gtf', type=str, required=True,
                        help="""The path to the gtf file that you want to use for your annotation. (default: %(default)s)""")
    parser.add_argument('--gtf-index', type=str, required=True,
                        help="""The path to the gtf index file generated from "gtf_to_genes". (default: %(default)s)""")
    parser.add_argument('--genome-fastas', type=str, required=True, nargs='+',
                        help="""A list of paths to genomic fasta files or directories where they are stored. (default: %(default)s)""")
    parser.add_argument('--flank-len', type=int, default=2000,
                        help="""The length in bp that should be harvested from the 5' end of the tx. (default: %(default)s)""")
    parser.add_argument('--out-dir', type=str, default='.',
                        help="""A path to a directory where you would like the output files to be stored. (default: %(default)s)""")
    parser.add_argument('--dump-megafasta', action='store_true',
                        help="""Save concatonated fasta file for debugging. (default: %(default)s)""")
    parser.add_argument('--dump-stats', action='store_true',
                        help="""Print a list of Tx/gene names and the r- p-values that passed the filter and exit without getting fastas. (default: %(default)s)""")

    args = parser.parse_args()

    # tmp files will be stored here
    tmp_files = Bag()

    # 1: Use a correlation filter to pull out any Tx that is sufficiently
    #    similar to the model Tx
    vectDict = mangle_expn_vectors(expnPath=args.expn_path,
                                   txNameHeader=args.tx_name_header,
                                   condHeaders=args.cond_headers,
                                   manualHeaders=args.manual_headers)

    # Compare against the threshold exactly as given.  (Building the lambda
    # through eval() with "%f" rounded the threshold to 6 decimal places.)
    compare = {'>=': operator.ge, '<=': operator.le}[args.pearson_filter_type]
    rThresh = args.pearson_filter_thresh
    filterFunc = lambda x: compare(x, rThresh)

    filterDict = pearsonExpnFilter(modelVector=vectDict[args.tx_name],
                                   targetVectors=vectDict,
                                   filterFunc=filterFunc)

    # remove vectors whose r's pVal is not significant (--pval-filter-thresh).
    # key[1] is compared with the p-value threshold here; key[0] is the sort
    # key (r) and key[2] the Tx name further down.
    sigVectors = {}
    for key in filterDict:
        if key[1] <= args.pval_filter_thresh:
            sigVectors[key] = filterDict[key]

    matchVectors = sigVectors

    ## DISABLED:
    ## Impose a distance filter to further refine the gene set
    ## incorporating magnitudes of the absolute levels of gene expression:
    ## set the boundaries of acceptable deviation for the target gene mean
    ## expression magnitude by bootstrapping.  The metric for comparison will
    ## be the average of the differences of each point in remaining vectors
    ## against the target vector.
    ## 1) calc the metrics for each remaining gene's vector
    ##avgDists = {}
    ##for key in sigVectors:
    ##    avgDist_i = np.mean(np.subtract(vectDict[args.tx_name],
    ##                                    sigVectors[key]))
    ##    avgDists[key] = avgDist_i
    ### 2) bootstrap to get a stdErr
    ##medianEst,stdErrEst,lo95,hi95 = basic_bootstrap_est(avgDists.values())
    ### 3) recover keys that fall within +/- 1 SE
    ##matchVectors = {}
    ##for key in avgDists:
    ##    avgDist = avgDists[key]
    ##    if (avgDist >= -stdErrEst) and (avgDist <= stdErrEst):
    ##        matchVectors[key] = sigVectors[key]

    # Sort txList so that the highest r values are at the top
    # and save vectors and this info out to file
    txList = sorted(matchVectors.keys(),key=lambda x: x[0], reverse=True)
    txRows = ['%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row])))
              for row in txList]

    if args.dump_stats:
        # Stats-only mode: print and leave before any temp file exists, so
        # nothing is left behind in the temp directory.
        sys.stdout.writelines(txRows)
        sys.exit(0)

    # delete=False: this file is moved into --out-dir during clean up
    sortedTxListFile = NamedTemporaryFile(mode='w+t',prefix='txExpnVectFilteredBy_r.',suffix=".tsv",delete=False)
    sortedTxListFile.writelines(txRows)
    tmp_files['sortedTxListFile'] = sortedTxListFile
    sortedTxListFile.close()

    g2gObj = gtf_to_genes.get_indexed_genes_matching_gtf_file_name(index_file_name=args.gtf_index, logger=logger, regex_str=args.gtf)[-1]
    txDict = filter_GTF_4_Tx(txList=[x[2] for x in txList],g2gObj=g2gObj)
    tmp_files['txBedFile'] = convert_2_bed(txDict=txDict)

    # 2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking
    #    sequences into a new FASTA
    fastaRecLengths,fastaSeqs = fastaRec_length_indexer(fastaFiles=args.genome_fastas)
    tmpFastaRecLengthFile = NamedTemporaryFile(mode='w+b',prefix='tmpFastaRecLengthFile.',suffix=".txt")
    for seqRec in fastaRecLengths:
        tmpFastaRecLengthFile.write("%s\t%s\n" % (seqRec,fastaRecLengths[seqRec]))
    # flush so the helper below can read the file by name
    tmpFastaRecLengthFile.flush()

    # TODO: concatonate fasta files
    megaFastaFile = NamedTemporaryFile(mode='w+b',prefix='tmpMegaFastaFile.',suffix=".fas")
    for fasta in fastaSeqs:
        megaFastaFile.write('>%s\n%s\n' % (fasta,fastaSeqs[fasta]))
    megaFastaFile.flush()

    tmp_files['flankBed'] = get_fastas(txBed=tmp_files.txBedFile.name,
                                       genomeFasta=megaFastaFile.name,
                                       lenIndex=tmpFastaRecLengthFile.name,
                                       lenFlanks=args.flank_len)

    # CLEAN UP:
    # TODO: Close all tmp_files, and move to args.outDir
    mkdirp(args.out_dir)
    for f in tmp_files:
        # Not every entry is a temp-file wrapper; ignore the ones that have no
        # .delete attribute / .close() method.
        try:
            tmp_files[f].delete = False
        except AttributeError:
            pass
        try:
            tmp_files[f].close()
        except AttributeError:
            pass

    # ['sortedTxListFile', 'flankBed', 'txBedFile', 'flankFasta']
    sortedTxListFile = "%s/sortedTxList.tsv" % (args.out_dir)
    flankBed = "%s/flankBed.bed" % (args.out_dir)
    txBedFile = "%s/txBed.bed" % (args.out_dir)
    flankFasta = "%s/flankFasta.fas" % (args.out_dir)

    # 0o775 is the same mode as the old 0775 literal, spelled so that it also
    # parses on Python 3.
    shutil.move(tmp_files.sortedTxListFile.name, sortedTxListFile)
    os.chmod(sortedTxListFile,0o775)

    tmp_files.flankBed.saveas(flankBed)
    os.chmod(flankBed,0o775)

    shutil.move(tmp_files.txBedFile.name, txBedFile)
    os.chmod(txBedFile,0o775)

    shutil.move(tmp_files.flankBed.seqfn, flankFasta)
    os.chmod(flankFasta,0o775)

    if args.dump_megafasta:
        megaFasta = "%s/megaFasta.fas" % (args.out_dir)
        megaFastaFile.delete = False
        megaFastaFile.close()
        shutil.move(megaFastaFile.name, megaFasta)
        os.chmod(megaFasta,0o775)
    else:
        # not kept: closing removes the temp file now instead of at exit
        megaFastaFile.close()

    # the length index was only an input to get_fastas(); closing removes it
    tmpFastaRecLengthFile.close()
def _firstBedDataLine(bedPath):
    """Return the first line of bedPath that is neither blank nor a
    track/browser/comment header line ('' if there is none)."""
    with open(bedPath, "rU") as bedFile:
        for line in bedFile:
            if line.strip() and not line.startswith(("track", "browser", "#")):
                return line
    return ""


def divByWindow(bedA_Path, bedB_Path, win=[500, 500], cols=[6, 6], side="right", outDir="."):
    """Create files separating features in bedB by those falling within the
    area defined by <win> around features in bedA and those outside this area.

    If A.bed is stranded, the area is defined by win[0] upstrm and win[1]
    dwnstrm on the FEATURE's strand (windowBed -sw).  Otherwise its win[0]
    upstrm and win[1] dwnstrm on the CONTIG/CHROM's plus strand.
    Files are output to outDir (created if needed).

    bedA_Path, bedB_Path : paths to the two BED files.
    win                  : [left, right] window sizes handed to windowBed
                           -l / -r.  (The list defaults are only read, never
                           modified.)
    cols, side           : see DOC for extractFromDoubleSidedBedtoolOut().

    Returns:
        (path of bedB features inside the window,
         path of bedB features outside the window)

    Raises:
        InvalidFileFormatError: the first data line of either input does not
            look like a BED line according to isBEDline().
    """
    # Prepare outDir if it doesnt already exist
    mkdirp(outDir)

    # Collect some useful info
    bedA_name = bedA_Path.split("/")[-1].replace(".bed", "")
    bedB_name = bedB_Path.split("/")[-1].replace(".bed", "")
    B_in_A_winComboPath = "%s/%s_featsIn_%s_Win%sl%sr_combo.bed" % (outDir, bedB_name, bedA_name, win[0], win[1])

    # Establish whether inputs look like BED files.  The first real data line
    # is inspected (header lines skipped), so files with or without a track
    # header -- including files holding a single feature -- are judged on an
    # actual feature line.
    lineA = _firstBedDataLine(bedA_Path)
    lineB = _firstBedDataLine(bedB_Path)
    if not isBEDline(lineA):
        raise InvalidFileFormatError("%s does not seem to be in BED format." % (bedA_Path))
    if not isBEDline(lineB):
        raise InvalidFileFormatError("%s does not seem to be in BED format." % (bedB_Path))

    # If bedA is stranded: use windowBed with -sw option, otherwise with only
    # -l,-r options to create file from bedB features INSIDE window around
    # features in bedA.
    windowArgs = "-a %s -b %s -l %s -r %s" % (bedA_Path, bedB_Path, win[0], win[1])
    if isStranded(lineA):
        windowArgs += " -sw"
    runExternalApp("windowBed", "%s > %s" % (windowArgs, B_in_A_winComboPath))

    # Clean B_in_A_winComboPath of the matching bedA entry and remove any
    # redundant bedB entries
    cleanedBsInWinPath = extractFromDoubleSidedBedtoolOut(B_in_A_winComboPath, cols=cols, side=side, outDir=outDir)

    # Change file name to reflect its not combo anymore.  Only the file name
    # is rewritten, so a directory that happens to contain "_combo_" or
    # "_featsIn_" is left alone.
    dirPart, sep, fileName = cleanedBsInWinPath.rpartition("/")
    cleanedName = fileName.replace("_combo_", "_cleaned_")
    cleanedBsInWinNewPath = dirPart + sep + cleanedName
    runExternalApp("mv", "%s %s" % (cleanedBsInWinPath, cleanedBsInWinNewPath))

    # Create file with bedB feats OUTSIDE of window of features in bedA.
    cleanedBsNotInWinPath = dirPart + sep + cleanedName.replace("_featsIn_", "_featsNotIn_")
    onlyInA(bedB_Path, cleanedBsInWinNewPath, cleanedBsNotInWinPath)
    # Alternative kept for reference:
    # resultIsectBed = runExternalApp('intersectBed','-a %s -b %s -v > %s' % \
    #                                 (bedB_Path,
    #                                  cleanedBsInWinNewPath,
    #                                  cleanedBsNotInWinPath))

    # Return Filenames of divided bed files
    return (cleanedBsInWinNewPath, cleanedBsNotInWinPath)
def main():
    """Command-line entry point for the motif analysis run.

    Parses the command line with optparse, validates the options, then:
      1. builds the probeset via getProbSet(--promoters, --thresh), optionally
         keeping only the last --plen characters of every sequence,
      2. loads motifs via getMotifs() and filters them with --motif-filter,
      3. calls motifHyprGeoEnrichmentTAMO() once for the foreground derived
         from --genes and once per random foreground (--plot-fdrs times),
      4. hands the collected data to writeDataTable() and, per motif, plotHist().

    With --check-seqs, seqStats() is called and the script exits right after
    the sequences have been loaded.  With no arguments at all, the help text
    is printed and the script exits with status 0.

    Raises:
        InvalidOptionError: if a required option is missing or --motif-filter
            does not begin with 'lambda x:'.
    """
    # +++++++++++ File Parsing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""
    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)
    parser.add_option('--motifs', type='str', default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str', default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--motif-filter', type='str', default='lambda x: x',
                      help="""Filter fuction to allow filtering of motifs in motif file. Its a lambda function. The default returns all motifs. If you don't understand this, please leave it alone. (default=%default).""")
    parser.add_option('--thresh', type='float', default=0.75,
                      help="""Fractional score threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str', default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int', default=None,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string', default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string', default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string', default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int', default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    ##parser.add_option('--from-possum',default=False,
    ##help="""Path to possumSearch outFile, skips motif finding step. (default=%default)""")
    parser.add_option('--verbose', action='store_true', default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs', action='store_true', default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")
    ##parser.add_option('--expect',action='store_true',default=False,
    ##help="""Use median seqLen to set success threshold at greater than the estimated expected number of occurences in each promoter [pValThresh*searchesPerSeq]. (default=%default).""")

    (opts, args) = parser.parse_args()

    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under `python -S`).
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    if opts.job == 'int(time())':
        # The default is a placeholder string; replace it with a real timestamp.
        opts.job = int(time())

    # Every one of these options is needed further down.  The previous test
    # joined them with `or`, so it could never fire (two of them have
    # non-empty defaults).  --motifs is only consumed after the --check-seqs
    # early exit, so it is not demanded in that mode.
    required = [('--promoters', opts.promoters),
                ('--motif-type', opts.motif_type),
                ('--thresh', opts.thresh)]
    if not opts.check_seqs:
        required.insert(0, ('--motifs', opts.motifs))
    missing = [flag for flag, value in required if value in (None, '')]
    if missing:
        raise InvalidOptionError("The following required argument(s) are missing: %s" % (' '.join(missing)))

    if not opts.motif_filter.startswith('lambda x:'):
        raise InvalidOptionError("**ERROR: the --motif-filter option must begin with 'lambda x:'**")
    # SECURITY NOTE(review): eval() runs arbitrary code taken from the command
    # line; the prefix check above is a sanity check, not a sandbox.
    opts.motif_filter = eval(opts.motif_filter)

    # --plen needs no further conversion: optparse (type='int') has already
    # rejected non-integer values, so it is either None or an int here.

    def _say(text):
        # Progress messages go to stdout only when --verbose was given.
        if opts.verbose:
            sys.stdout.write(text)

    # +++++ Lets Begin +++++
    _say('\n%s\n\n' % (' '.join(sys.argv)))

    _say('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out, opts.job)
    mkdirp(opts.out)

    _say('building seqDict...\n')
    probeset = getProbSet(opts.promoters, opts.thresh)

    if opts.plen:
        _say("adjusting promoter lengths to no more than %s and conserving the 3' ends...\n" % (opts.plen))
        # Iterate over a snapshot of the keys: values are reassigned in the loop.
        for name in list(probeset.probes.keys()):
            probeset.probes[name] = probeset.probes[name][-opts.plen:]

    seqDict = getSeqs(probeset)

    if opts.check_seqs:
        seqStats(seqDict, show=True)
        sys.exit(0)

    _say('getting nucFreqs...\n')
    nucFreqs = getSeqFreqs(seqDict)

    _say('building filtered motifList...\n')
    motifList = getMotifs(opts.motifs, nucFreqs, opts.motif_type)
    preFilt = len(motifList)
    motifList = filterMotifs(motifList, key=opts.motif_filter)
    postFilt = len(motifList)
    _say('using %s of %s motifs...\n' % (postFilt, preFilt))

    _say('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)

    outData = {}

    _say('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichmentTAMO(motifList, probeset, foregroundSeqs, factor=opts.thresh, bestFactor=False)
    appendData(realData, outData)

    # One control run per requested random foreground (--plot-fdrs).
    for i in range(opts.fdrs):
        _say('ctrl_%s:\n' % (i))
        _say('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs, seqDict)
        _say('\tcalculating p-values...\n')
        ctrlData = motifHyprGeoEnrichmentTAMO(motifList, probeset, rForground, factor=opts.thresh, bestFactor=False)
        appendData(ctrlData, outData)

    _say('writting outData...\n')
    writeDataTable(outBaseStr, outData, motifList)

    _say('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr, outData, m.id)
parser.add_option('--bt-opts', dest="bt_opts", type='string',default=defaultBtOpts, help="""Quoted string to pass as arguments to bowtie that have not already been provided. Defualt mimics Eland. Unless --override is used, the options will be appended to the default. (default=%default)""") parser.add_option('--out-dir', dest="out_dir", type='string',default=None, help="""Central directory to deposit output files. (default=%default)""") parser.add_option('--cent-log', dest="cent_log", action='store_true',default=False, help="""Include to redirect stdout and stderr to a central log file in out_dir. [useful for reproducibility and debugging] (default=%default)""") parser.add_option('--override', dest="override", action='store_true',default=False, help="""A switch that causes the default --bt-opts to be replaced by those provided as arguments to --bt-opts instead of adding to them. (default=%default)""") (opts, args) = parser.parse_args() if len(sys.argv) == 1: parser.print_help() exit(0) if opts.out_dir: mkdirp(opts.out_dir) opts.out_dir = opts.out_dir.rstrip('/') else: opts.out_dir = os.getcwd() if opts.cent_log: start_sitrep(opts.out_dir) if not 'BOWTIE_INDEXES' in os.environ: raise Exception('ERROR: please set the BOWTIE_INDEXES environment variable!') try: opts.bt_index opts.bam_base opts.fastqs except KeyError: raise MissingArgumentError('missing at least one of the required command line arguments: --bt-index, --bam-base, --fastqs') if not opts.override: opts.bt_opts = "%s %s" % (defaultBtOpts,opts.bt_opts)