コード例 #1
0
def main():
    """Filter paired-end fastq files, launching one subprocess per input table row.

    Command line inputs:
    -- input_table: txt table file containing tab-delim file path inputs for files.filter_PEfastQs()
           PE_FastqPathFwd<tab>PE_FastqPathRev<tab>OutputComboBaseName<newLine>
    -- filter_func: string representing a lambda func to act as filter for fastqRecs
    -- out_dir: directory that receives the output files (created if needed)
    Outputs:
    -- For each table row, a filter_PEfastQs() subprocess is started with four
       output paths inside out_dir:
           <fwdBase>.filtered.mated.fastq, <revBase>.filtered.mated.fastq,
           <comboBase>.filtered.singled.fastq, <comboBase>.filtered.failed.fastq
       where each base is the file name up to the first '.fastq'.
    Returns:
    -- list of the started multiprocessing.Process objects; they are NOT joined
       here, so the caller decides when to wait for them.
    Raises:
    -- InvalidFileFormatError if any table line does not have exactly three columns."""

    desc  = """This script filters paired fastq files based on a provided lambda filter logic.  Input and output paths are determined by the input table file."""

    parser = argparse.ArgumentParser(description=desc)

    parser.add_argument('input_table', type=str,
                        help="""Path to input table file.""")
    parser.add_argument('filter_func', type=str,
                        help="""lambda filter function (quoted string).""")
    parser.add_argument('out_dir', type=str,
                        help="""Path to out directory.""")

    args = parser.parse_args()

    # print the called command:
    sys.stderr.write("%s\n" % (" ".join(sys.argv)))

    # open and parse the input table file (the handle is closed as soon as it is read)
    with open(args.input_table,'rU') as tableFile:
        inputs = [x.strip('\n').split('\t') for x in tableFile]
    for i in inputs:
        if len(i) != 3:
            raise InvalidFileFormatError("At least one line in %s does not have exactly three columns:\n%s\n" % (args.input_table,i))

    # create out_dir if needed
    mkdirp(args.out_dir)

    # SECURITY NOTE: this evaluates an arbitrary expression taken from the
    # command line.  Emptying __builtins__ is NOT a sandbox; only run this
    # script with filter strings you trust.
    filtFunc = eval(args.filter_func,{"__builtins__":None})

    outDir = args.out_dir.rstrip('/')

    def baseName(path):
        # file name without directories, cut at the first '.fastq'
        return path.split('/')[-1].split('.fastq')[0]

    # set up and unleash the subprocesses
    jobs = []
    for fwdPath, revPath, comboName in inputs:
        arguments = (
            filtFunc, fwdPath, revPath,
            "%s/%s.filtered.mated.fastq"   % (outDir,baseName(fwdPath)),    # matchedPassPath1
            "%s/%s.filtered.mated.fastq"   % (outDir,baseName(revPath)),    # matchedPassPath2
            "%s/%s.filtered.singled.fastq" % (outDir,baseName(comboName)),  # singlePassPath
            "%s/%s.filtered.failed.fastq"  % (outDir,baseName(comboName)),  # nonPassPath
        )

        p = mp.Process(target=filter_PEfastQs,args=arguments)
        jobs.append(p)
        p.start()

    return jobs
コード例 #2
0
ファイル: align.py プロジェクト: xguse/rSeqPipeline
def bowtie_index(reference_in,ebwt_outfile_base,runDir,options=None):
    """Create bowtie indexes from new fasta set.
    reference_in      : passed to bowtie-build as <reference_in>
    ebwt_outfile_base : passed to bowtie-build as <ebwt_outfile_base>
    options : quoted string representing valid cmd line bowtie-build options
              (may be None/empty, in which case none are passed)
    runDir  : path to dir to place stdErr/stdOut logs - all steps of pipeline scripts should share same runDir

    Returns whatever runExternalApp() returns; element 0 is reported as
    stdout and element 1 as stderr.
    
    ----------
    bowtie-build help text:
    
    Usage: bowtie-build [options]* <reference_in> <ebwt_outfile_base>
        reference_in            comma-separated list of files with ref sequences
        ebwt_outfile_base       write Ebwt data to files with this dir/basename
    Options:
        -f                      reference files are Fasta (default)
        -c                      reference sequences given on cmd line (as <seq_in>)
        -C/--color              build a colorspace index
        -a/--noauto             disable automatic -p/--bmax/--dcv memory-fitting
        -p/--packed             use packed strings internally; slower, uses less mem
        -B                      build both letter- and colorspace indexes
        --bmax <int>            max bucket sz for blockwise suffix-array builder
        --bmaxdivn <int>        max bucket sz as divisor of ref len (default: 4)
        --dcv <int>             diff-cover period for blockwise (default: 1024)
        --nodc                  disable diff-cover (algorithm becomes quadratic)
        -r/--noref              don't build .3/.4.ebwt (packed reference) portion
        -3/--justref            just build .3/.4.ebwt (packed reference) portion
        -o/--offrate <int>      SA is sampled every 2^offRate BWT chars (default: 5)
        -t/--ftabchars <int>    # of chars consumed in initial lookup (default: 10)
        --ntoa                  convert Ns in reference to As
        --seed <int>            seed for random number generator
        -q/--quiet              verbose output (for debugging)
        -h/--help               print detailed description of tool and its options
        --usage                 print this usage message
        --version               print version information and quit

    """
    
    # make runDir if it does not yet exist
    print("creating: %s" % (runDir))
    mkdirp(runDir)
    
    # Construct cmdArgs; without options only the two positional args are
    # passed (previously cmdArgs was left undefined in that case).
    if options:
        cmdArgs = "%s %s %s" % (options,reference_in,ebwt_outfile_base)
    else:
        cmdArgs = "%s %s" % (reference_in,ebwt_outfile_base)

    # Run bowtie-build and capture output
    btBuildResults = runExternalApp('bowtie-build',cmdArgs)
    
    # Report bowtie-build stdout and stderr
    if btBuildResults[0]:
        print('[%s] %s' % (whoami(),btBuildResults[0]))
    if btBuildResults[1]:
        # sys.stderr is a file object, not a callable: use .write()
        sys.stderr.write('[%s] %s\n' % (whoami(),btBuildResults[1]))
        
    return btBuildResults
コード例 #3
0
ファイル: initNewGenome.py プロジェクト: asntech/rSeqPipeline
def init_dir_structure(spcName,versionID,baseDir,isCurrent=False):
    """Build the directory layout that holds genome and index data files.

    Three directories are created under <baseDir>/genomes/<spcName>/<versionID>
    (fasta/, annotations/, mysql/) and one directly under <baseDir> (indexes/).
    Each path is announced on stdout and then handed to mkdirp().
    When isCurrent is truthy, a soft link named "current" that points at the
    versionID dir is added next to it.

    Example args:
    spcName = 'aedes_aegypti'
    versionID   = 'release_7'
    baseDir     = '/home/data'
    """

    speciesRoot = '%s/genomes/%s' % (baseDir,spcName)
    spcVerDir   = '%s/%s' % (speciesRoot,versionID)

    # Creation order: the three per-version dirs first, then the shared index dir.
    wanted = ['%s/%s/' % (spcVerDir,leaf) for leaf in ('fasta','annotations','mysql')]
    wanted.append('%s/indexes/' % (baseDir))

    for dirPath in wanted:
        print("creating dir: %s" % (dirPath))
        mkdirp(dirPath)

    if not isCurrent:
        return

    print("creating sym link from %s to 'current' because <isCurrent> is set to True" % (spcVerDir))
    os.symlink(spcVerDir, '%s/current' % (speciesRoot))
コード例 #4
0
ファイル: bed.py プロジェクト: asntech/rSeqPipeline
def extractFromDoubleSidedBedtoolOut(filePath, cols, side="right", outDir="."):
    """Creates new file from filePath using only the bedInfo from the
    left/right (based on 'side') side of a BEDtools outFile with double-
    sided output (side=[3,6]). 'cols' must be a list with length of columns
    in each 'side' of the double output.  'side' = keep the 'right' or
    'left' side of the output line.

    Lines starting with "track" are skipped.  The new file is written to
    <outDir>/<input name without ".bed">_<side>.bed, then passed through
    'sort -u' to remove redundant lines.

    Returns the path of the new file.
    Raises InvalidOptionError if 'side' is neither "right" nor "left", and
    InvalidFileFormatError if a line does not split into exactly
    cols[0] + cols[1] columns."""

    # Reject a bad 'side' before any file is touched.
    if side not in ("right", "left"):
        raise InvalidOptionError('option "side" must be one of %s. Was: %s.' % (["right", "left"], side))
    keep = 1 if side == "right" else 0

    # Prepare outDir if it doesnt already exist
    mkdirp(outDir)

    outFilePath = "%s/%s_%s.bed" % (outDir, filePath.replace(".bed", "").split("/")[-1], side)

    # 'with' guarantees both handles are closed, including on a format error.
    with open(filePath, "rU") as inFile:
        with open(outFilePath, "w") as outFile:
            for lineNum, line in enumerate(inFile, 1):
                if line.startswith("track"):
                    continue
                fields = line.strip("\n").split("\t")

                # Divide the line into two based on cols.  Slicing from
                # cols[0] yields an empty right half for a one-sided line,
                # so the length check below rejects it.
                divLine = fields[: cols[0]], fields[cols[0] :]

                # Ensure the length of each new line is what we expect, then write out cleaned line
                if len(divLine[0]) != cols[0] or len(divLine[1]) != cols[1]:
                    raise InvalidFileFormatError(
                        'line %s in file %s has unexpected number of columns or the values in "cols" is incorrect.'
                        % (lineNum, filePath)
                    )
                outFile.write("%s\n" % ("\t".join(divLine[keep])))

    # Sort and remove redundancy from lines in new file
    runExternalApp("sort", "-u %s > %s.tmp" % (outFilePath, outFilePath))
    runExternalApp("mv", "%s.tmp %s" % (outFilePath, outFilePath))
    return outFilePath
コード例 #5
0
ファイル: rScope.py プロジェクト: asntech/rSeqPipeline
def runSCOPE(pLen,genes,jobName,scopeDir,outDir,paramName,jMem='2000',verbose=False):
    """Run one SCOPE job through java and return what runExternalApp() gives back.

    Notes:
    scopeDir  = full path; the process changes into it (and stays there)
                because the class path 'dist/scope.jar' is relative.
    genes     = 'gene;gene;gene;etc'
    pLen      = promoter length to use.
    outDir    = results dir; a relative path is anchored to the directory
                that is current when this function is entered.
    paramName = value given to SCOPE's -pf option.
    jMem      = java heap size in MB (used as -Xmx<jMem>m).
    verbose   = if truthy, elements 0 and 1 of the result are written to
                <outDir>/<jobName>.<pLen>.out and .err respectively.
    Any error reporting is left to runExternalApp()."""

    # Resolve outDir before the chdir below, then drop trailing slashes.
    if not outDir.startswith('/'):
        outDir = os.getcwd()+'/'+outDir
    outDir = outDir.rstrip('/')

    outPathBase = '%s/%s.%s' % (outDir,jobName,pLen)

    # Java command line for SCOPE
    javaTemplate = '''-Xmx%sm -cp dist/scope.jar edu.dartmouth.bglab.beam.CGIScope -pf "%s" -ofx "%s.xml" -oft "%s.txt" -oje "%s" -qg "%s" -sgl "%s" -drb "true" -dra "true" -drbp "true"'''
    javaValues = (jMem, paramName, outPathBase, outPathBase, jobName, genes, pLen)
    argString = javaTemplate % javaValues

    # SCOPE has to be launched from inside its own directory.
    os.chdir(scopeDir)
    mkdirp(outDir) # make outDir along with parent dirs as needed
    print('starting run...')
    resultSCOPE = runExternalApp('java',argString)

    # Dump captured stdOut/Err next to the results when asked to.
    if verbose:
        logFiles = [open(outPathBase+ext,'w') for ext in ('.out','.err')]
        for idx, logFile in enumerate(logFiles):
            logFile.write(resultSCOPE[idx])
        for logFile in logFiles:
            logFile.close()

    return resultSCOPE
コード例 #6
0
def plotScatter(pearsonStats,normedTxCntsList,opts):
    """Scatter-plot two count vectors with a degree-1 best-fit line and save as PNG.

    pearsonStats     : indexable; elements 0 and 1 are printed in the plot's
                       text box after 'Pearson:' (presumably r and its
                       p-value -- confirm against the caller).
    normedTxCntsList : indexable; element 0 supplies x values, element 1 y values.
    opts             : options object providing
                         .log    - use log scales on both axes, and a '.log.png' suffix
                         .name_a - x-axis label / first part of file name
                         .name_b - y-axis label / second part of file name
                         .dir    - output directory (created if needed)
                         .show   - also open an interactive window

    The image is written to <opts.dir>/<name_a>_vs_<name_b>[.log].png.
    """
    import os  # local import: only needed to build the output path

    xVals = normedTxCntsList[0]
    yVals = normedTxCntsList[1]

    fig = pl.figure()
    ax  = fig.add_subplot(111)
    if opts.log:
        ax.set_xscale('log')
        ax.set_yscale('log')

    ax.scatter(xVals,yVals, s=15, c='b', marker='o', alpha=1)
    if not opts.log:
        ax.set_autoscale_on(False)
    ax.set_xlabel(opts.name_a)
    ax.set_ylabel(opts.name_b)

    # Best-fit line, drawn from x=1 to the largest x value.
    m,b    = pl.polyfit(xVals,yVals,1)
    lineXs = [1,max(xVals)]
    bfYs   = pl.polyval([m,b], lineXs)
    ax.plot(lineXs,bfYs,'r-')

    pl.text(0.01,0.99,'Pearson: %.4f, %s\nBest Fit: y=%.3f*x+%.3f' % (pearsonStats[0],pearsonStats[1],m,b),
            bbox=dict(facecolor='#87AACD', alpha=1),
            horizontalalignment='left',
            verticalalignment='top',
            transform = ax.transAxes)

    mkdirp(opts.dir)
    # Join with a separator so the file lands inside opts.dir even when the
    # option was given without a trailing '/'.
    suffix = '.log.png' if opts.log else '.png'
    pl.savefig(os.path.join(opts.dir,'%s_vs_%s%s' % (opts.name_a,opts.name_b,suffix)))

    print('Show?  %s' % (opts.show))
    if opts.show:
        pl.show()
コード例 #7
0
def main():
    """Command-line driver: score motifs for enrichment in a foreground gene set.

    Steps, as performed below:
      1. Parse and validate options (--out and --genes are always required).
      2. Load promoter sequences with getSeqs(); with --check-seqs just print
         sequence stats and exit.
      3. Obtain motif hits, either by searching here (getMotifs /
         getSeqFreqs / getEvalHitDict) or, with --from-possum, by parsing an
         existing possumSearch output file.
      4. Run motifHyprGeoEnrichment() on the real foreground and on
         --plot-fdrs random foregrounds.
      5. Write the data table and one histogram per motif.

    Raises InvalidOptionError when required options are missing.
    """
    #+++++++++++ File Parsing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""
    
    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)
    
    parser.add_option('--motifs', type='str',default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str',default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--thresh', type='float',default=0.005,
                      help="""P-value threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str',default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int',default=1000,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string',default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string',default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string',default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int',default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    parser.add_option('--from-possum',default=False,
                      help="""Path to possumSearch outFile, skips motif finding step. (default=%default)""")    
    parser.add_option('--verbose',action='store_true',default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs',action='store_true',default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")
    parser.add_option('--expect',action='store_true',default=False,
                      help="""Use median seqLen to set success threshold at greater than the estimated expected number of occurences in each promoter [pValThresh*searchesPerSeq]. (default=%default).""")
    
    (opts, args) = parser.parse_args()
    
    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    # The default --job is a placeholder string; replace it with the current time.
    if opts.job == 'int(time())':
        opts.job = int(time())
    if not opts.from_possum:
        # Every one of these must be present.  (The previous test applied
        # 'not' to --motifs only, so it fired on every run because
        # --motif-type always has a truthy default.)
        required = (opts.motifs, opts.motif_type, opts.thresh, opts.promoters, opts.plen)
        if any(val is None or val == '' for val in required):
            raise InvalidOptionError("""Unless --from-possum, the following argument are ALL required:
            --motifs
            --motif-type
            --thresh
            --promoters
            --plen""")
    else:
        if not opts.motifs:
            raise InvalidOptionError("When using --from-possum, --motifs should be the PSSM file used to generate this particular hit set.")
    
    def say(msg):
        # progress messages go to stdout only when --verbose was given
        if opts.verbose:
            sys.stdout.write(msg)

    # +++++ Lets Begin +++++
    say('\n%s\n\n' % (' '.join(sys.argv)))
    
    say('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out,opts.job)
    mkdirp(opts.out)
    
    say('building seqDict...\n')
    seqDict = getSeqs(opts.promoters,opts.plen)
    if opts.check_seqs:
        seqStats(seqDict,show=True)
        sys.exit(0)
    else:
        seqData = seqStats(seqDict,show=False)
    # --expect turns the flag into a numeric threshold; otherwise it is 0.
    if opts.expect:
        opts.expect = seqData['medLen']*opts.thresh*2
    else:
        opts.expect = 0
        
    # --- am i doing the searching myself? ---
    if not opts.from_possum:
        # -- yes --
        say('building motifList...\n')
        motifList      = getMotifs(opts.motifs,opts.motif_type)
    
        say('getting nucFreqs...\n')
        halfAT,halfGC  = getSeqFreqs(seqDict)
    
        say('building hitDict...\n')
        motifHits      = getEvalHitDict(motifList,seqDict,pThresh=opts.thresh,halfAT=halfAT,halfGC=halfGC)
    else:
        # -- no: the hits already exist, they only need to be parsed --
        say('skipping to building hitDict step...\n')
        pACs        = getPossumProfileACs(opts.motifs)
        possumTable = getPossumHitTable(opts.from_possum,headers=possumHeaders)
        motifHits   = getPossumHitDict(possumTable,seqDict.keys(),pACs)
        motifList   = makeMotifListFromPossum(pACs) # create list of DummyPlug MotifObjs for compatibility
    
    say('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)
    outData        = {}
    
    say('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichment(motifList,motifHits,foregroundSeqs,opts.expect)
    appendData(realData,outData)
    
    # Control runs on random foregrounds for FDR estimation.
    for i in range(opts.fdrs):
        say('ctrl_%s:\n' % (i))
        say('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs,seqDict)
        
        say('\tcalculating p-values...\n')
        ctrlData   = motifHyprGeoEnrichment(motifList,motifHits,rForground,opts.expect)
        appendData(ctrlData,outData)
    
    say('writting outData...\n')
    writeDataTable(outBaseStr,outData,motifList)
    
    say('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr,outData,m.id)
コード例 #8
0
ファイル: align.py プロジェクト: xguse/rSeqPipeline
def bowtie_align(ebwt,readsString,hit,runDir,options=None):
    """Run alignment of fastQ to bowtie index.
    ebwt        : passed to bowtie as <ebwt>
    hit         : passed to bowtie as <hit>
    options     : quoted string representing valid cmd line bowtie options
                  (may be None/empty, in which case none are passed)
    runDir      : path to dir to place stdErr/stdOut logs - all steps of pipeline scripts should share same runDir
    readsString : appropriate quoted string representing which fastq files to use (see bowtie -h).

    Returns whatever runExternalApp() returns; element 0 is echoed line by
    line to stdout and element 1 to stderr, each line prefixed with whoami().
    
    ----------
    bowtie help text:
    
    Usage: 
    bowtie [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | <s>} [<hit>]
  
    <m1>    Comma-separated list of files containing upstream mates (or the
            sequences themselves, if -c is set) paired with mates in <m2>
    <m2>    Comma-separated list of files containing downstream mates (or the
            sequences themselves if -c is set) paired with mates in <m1>
    <r>     Comma-separated list of files containing Crossbow-style reads.  Can be
            a mixture of paired and unpaired.  Specify "-" for stdin.
    <s>     Comma-separated list of files containing unpaired reads, or the
            sequences themselves, if -c is set.  Specify "-" for stdin.
    <hit>   File to write hits to (default: stdout)
    Input:
      -q                 query input files are FASTQ .fq/.fastq (default)
      -f                 query input files are (multi-)FASTA .fa/.mfa
      -r                 query input files are raw one-sequence-per-line
      -c                 query sequences given on cmd line (as <mates>, <singles>)
      -C                 reads and index are in colorspace
      -Q/--quals <file>  QV file(s) corresponding to CSFASTA inputs; use with -f -C
      --Q1/--Q2 <file>   same as -Q, but for mate files 1 and 2 respectively
      -s/--skip <int>    skip the first <int> reads/pairs in the input
      -u/--qupto <int>   stop after first <int> reads/pairs (excl. skipped reads)
      -5/--trim5 <int>   trim <int> bases from 5' (left) end of reads
      -3/--trim3 <int>   trim <int> bases from 3' (right) end of reads
      --phred33-quals    input quals are Phred+33 (default)
      --phred64-quals    input quals are Phred+64 (same as --solexa1.3-quals)
      --solexa-quals     input quals are from GA Pipeline ver. < 1.3
      --solexa1.3-quals  input quals are from GA Pipeline ver. >= 1.3
      --integer-quals    qualities are given as space-separated integers (not ASCII)
    Alignment:
      -v <int>           report end-to-end hits w/ <=v mismatches; ignore qualities
        or
      -n/--seedmms <int> max mismatches in seed (can be 0-3, default: -n 2)
      -e/--maqerr <int>  max sum of mismatch quals across alignment for -n (def: 70)
      -l/--seedlen <int> seed length for -n (default: 28)
      --nomaqround       disable Maq-like quality rounding for -n (nearest 10 <= 30)
      -I/--minins <int>  minimum insert size for paired-end alignment (default: 0)
      -X/--maxins <int>  maximum insert size for paired-end alignment (default: 250)
      --fr/--rf/--ff     -1, -2 mates align fw/rev, rev/fw, fw/fw (default: --fr)
      --nofw/--norc      do not align to forward/reverse-complement reference strand
      --maxbts <int>     max # backtracks for -n 2/3 (default: 125, 800 for --best)
      --pairtries <int>  max # attempts to find mate for anchor hit (default: 100)
      -y/--tryhard       try hard to find valid alignments, at the expense of speed
      --chunkmbs <int>   max megabytes of RAM for best-first search frames (def: 64)
    Reporting:
      -k <int>           report up to <int> good alignments per read (default: 1)
      -a/--all           report all alignments per read (much slower than low -k)
      -m <int>           suppress all alignments if > <int> exist (def: no limit)
      -M <int>           like -m, but reports 1 random hit (MAPQ=0); requires --best
      --best             hits guaranteed best stratum; ties broken by quality
      --strata           hits in sub-optimal strata aren't reported (requires --best)
    Output:
      -t/--time          print wall-clock time taken by search phases
      -B/--offbase <int> leftmost ref offset = <int> in bowtie output (default: 0)
      --quiet            print nothing but the alignments
      --refout           write alignments to files refXXXXX.map, 1 map per reference
      --refidx           refer to ref. seqs by 0-based index rather than name
      --al <fname>       write aligned reads/pairs to file(s) <fname>
      --un <fname>       write unaligned reads/pairs to file(s) <fname>
      --max <fname>      write reads/pairs over -m limit to file(s) <fname>
      --suppress <cols>  suppresses given columns (comma-delim'ed) in default output
      --fullref          write entire ref name (default: only up to 1st space)
    Colorspace:
      --snpphred <int>   Phred penalty for SNP when decoding colorspace (def: 30)
         or
      --snpfrac <dec>    approx. fraction of SNP bases (e.g. 0.001); sets --snpphred
      --col-cseq         print aligned colorspace seqs as colors, not decoded bases
      --col-cqual        print original colorspace quals, not decoded quals
      --col-keepends     keep nucleotides at extreme ends of decoded alignment
    SAM:
      -S/--sam           write hits in SAM format
      --mapq <int>       default mapping quality (MAPQ) to print for SAM alignments
      --sam-nohead       supppress header lines (starting with @) for SAM output
      --sam-nosq         supppress @SQ header lines for SAM output
      --sam-RG <text>    add <text> (usually "lab=value") to @RG line of SAM header
    Performance:
      -o/--offrate <int> override offrate of index; must be >= index's offrate
      -p/--threads <int> number of alignment threads to launch (default: 1)
      --mm               use memory-mapped I/O for index; many 'bowtie's can share
      --shmem            use shared mem for index; many 'bowtie's can share
    Other:
      --seed <int>       seed for random number generator
      --verbose          verbose output (for debugging)
      --version          print version information and quit
      -h/--help          print this usage message

    """
    # make runDir if it does not yet exist
    mkdirp(runDir)
    
    # Construct cmdArgs
    if options:
        cmdArgs = "%s %s  %s %s" % (options,ebwt,readsString,hit)
    else:
        cmdArgs = "%s %s %s" % (ebwt,readsString,hit)
        
    # Run and capture output
    print("Setting up bowtie call with the following cmd:\n\t\tbowtie %s" % (cmdArgs))
    btResults = runExternalApp('bowtie',cmdArgs)
    
    # Report stdout and stderr
    if btResults[0]:
        for line in btResults[0].split('\n'):
            print('[%s] %s' % (whoami(),line))
        
    if btResults[1]:
        for line in btResults[1].split('\n'):
            # write() adds no newline of its own; without one every stderr
            # line would run into the next.
            sys.stderr.write('[%s] %s\n' % (whoami(),line))
        
    return btResults
コード例 #9
0
                      help="""Directory path for output (default=%default).""")
    parser.add_option('--track', dest="track", type='string',default='untitled',
                      help="""Unbroken string to use for track name (default=%default).""")
    parser.add_option('--description', dest="description", type='string',default='none given',
                      help="""Quoted string for description of track (default=%default).""")
    parser.add_option('--rgb', dest="rgb", type='string',default='0,0,0',
                      help="""Unbroken comma separated string to denote color of track (default=%default).""")

    
    (opts, args) = parser.parse_args()
    
    if len(sys.argv) == 1:
        parser.print_help()
        exit(0)
    if opts.out.endswith('/'):
        opts.out = opts.out[:-1]
    if not len(opts.rgb.split(',')):
        raise InvalidOptionError('Malformed --rgb option: %s.' % (opts.rgb))
    
    # Prepare outdir 
    mkdirp(opts.out)
    outFile = open('%s/%s' % (opts.out,args[0].split('/')[-1].replace('.txt', '.bed')), 'w')
    outFile.write('track name=%s description="%s" useScore=0\n' % (opts.track,opts.description))
    
    # For every line starting "cigar:" convert to bed and write out
    for line in open(args[0],'rU'):
        if line.startswith('cigar:'):
            bedLine = exonerateCigar2BEDline(line,opts.rgb)
            outFile.write(bedLine)
    outFile.close()
コード例 #10
0
def main():
    """
    1: Collect Tx from one or more species that are within at least some r value of similarity to
       a provided example Tx or a submitted hypothetical expression vector.
    2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA
       for use in motif discovery.

    Results are moved into --out-dir as sortedTxList.tsv, txBed.bed,
    flankBed.bed and flankFasta.fas (plus megaFasta.fas with --dump-megafasta).
    With --dump-stats the filtered rows are printed to stdout and the
    program exits before any file is produced.
    """
    import operator  # local import: used to build the Pearson filter without eval
    
    desc = """(1) Collect Tx from one or more species that are within 
at least some r value of similarity to a provided example Tx or a 
submitted hypothetical expression vector. (2) Use GTFs, BEDtools, and 
genome FASTAs to extract the upstream flanking sequences into a new 
FASTA for use in motif discovery."""
    
    parser = argparse.ArgumentParser(description=desc)
    
    logger = logging.getLogger(sys.argv[0].split('/')[-1])
    
    parser.add_argument('--expn-path', type=str, required=True,
                        help="""Path to expression table file. \n(default: %(default)s)""")
    parser.add_argument('--tx-name', type=str, required=True,
                        help="""Name of the Tx you want to use as a model. (default: %(default)s)""")
    parser.add_argument('--pearson-filter-type', type=str, default='>=', choices=['>=','<='],
                        help="""Use >= to find similar expn profiles or <= to find opposite profiles. (default: %(default)s)""")
    parser.add_argument('--pearson-filter-thresh', type=float, default=0.7,
                        help="""Set the threshold of the Pearson r value for the filter. (default: %(default)s)""")
    parser.add_argument('--pval-filter-thresh', type=float, default=0.05,
                        help="""Set the upper threshold for the p-value of the Pearson r values to keep. (default: %(default)s)""")
    parser.add_argument('--tx-name-header', type=str, required=True,
                        help="""The text of the header in the expn table where tx names are stored. (default: %(default)s)""")
    parser.add_argument('--cond-headers', type=str, required=True, nargs='+',
                        help="""A list of the text of the headers in the expn table where the values for each condition are stored (--cond-headers cond1 cond2 ...). (default: %(default)s)""")
    parser.add_argument('--manual-headers', type=str, required=False, nargs='?',
                        help="""If the expn table does not have headers, provide a list of ordered names for them here. (default: %(default)s)""")
    parser.add_argument('--gtf', type=str, required=True,
                        help="""The path to the gtf file that you want to use for your annotation. (default: %(default)s)""")
    parser.add_argument('--gtf-index', type=str, required=True,
                        help="""The path to the gtf index file generated from "gtf_to_genes". (default: %(default)s)""")
    parser.add_argument('--genome-fastas', type=str, required=True, nargs='+',
                        help="""A list of paths to genomic fasta files or directories where they are stored. (default: %(default)s)""")
    parser.add_argument('--flank-len', type=int, default=2000,
                        help="""The length in bp that should be harvested from the 5' end of the tx. (default: %(default)s)""")
    parser.add_argument('--out-dir', type=str, default='.',
                        help="""A path to a directory where you would like the output files to be stored. (default: %(default)s)""")
    parser.add_argument('--dump-megafasta', action='store_true',
                        help="""Save concatonated fasta file for debugging. (default: %(default)s)""")
    parser.add_argument('--dump-stats', action='store_true',
                        help="""Print a list of Tx/gene names and the r- p-values that passed the filter and exit without getting fastas. (default: %(default)s)""")
    
    args = parser.parse_args()
    
    # tmp files will be stored here
    tmp_files = Bag()
    
    # 1: Use a correlation filter to pull out any Tx that is sufficiently similar to the model Tx
    vectDict = mangle_expn_vectors(expnPath=args.expn_path,txNameHeader=args.tx_name_header,condHeaders=args.cond_headers,manualHeaders=args.manual_headers)
    
    # Build the r-value predicate from the two CLI options.  A lookup table
    # replaces eval() on a formatted string, which also rounded the
    # threshold to six decimals via '%f'.
    compare   = {'>=': operator.ge, '<=': operator.le}[args.pearson_filter_type]
    rThresh   = args.pearson_filter_thresh
    filterFunc = lambda x: compare(x, rThresh)
    filterDict = pearsonExpnFilter(modelVector=vectDict[args.tx_name], targetVectors=vectDict, filterFunc=filterFunc)
    
    # Remove vectors whose r's pVal is not significant.  Keys are indexable:
    # element 1 is compared with the p-value threshold here, element 0 is the
    # sort key below, and element 2 is later passed on as the Tx name.
    sigVectors = {}
    for key in filterDict:
        if key[1] <= args.pval_filter_thresh:
            sigVectors[key] = filterDict[key]
    matchVectors = sigVectors
    
    ## (Disabled) Impose a distance filter to further refine the gene set,
    ## incorporating magnitudes of the absolute levels of gene expression.
    ##
    ## Set the boundaries of acceptable deviation for the target gene mean
    ## expression magnitude by bootstrapping.  The metric for comparison will
    ## be the average of the differences of each point in the remaining
    ## vectors against the target vector.
    ##
    ## 1) calc the metrics for each remaining gene's vector
    ##avgDists = {}
    ##for key in sigVectors:
        ##avgDist_i = np.mean(np.subtract(vectDict[args.tx_name],
                                           ##sigVectors[key]))
        ##avgDists[key] = avgDist_i
    ##
    ## 2) bootstrap to obtain a stdErr estimate
    ##medianEst,stdErrEst,lo95,hi95 = basic_bootstrap_est(avgDists.values())
    ##
    ## 3) recover keys that fall within +/- 1 SE
    ##matchVectors = {}
    ##for key in avgDists:
        ##avgDist = avgDists[key]
        ##if (avgDist >= -stdErrEst) and (avgDist <= stdErrEst):
            ##matchVectors[key] = sigVectors[key]
    
    # Sort txList so that the highest r values are at the top
    txList = sorted(matchVectors.keys(),key=lambda x: x[0], reverse=True)
    
    def formatRow(row):
        # key fields, then the vector values, all tab separated
        return '%s\t%s\n' % ('\t'.join(map(str,row)),'\t'.join(map(str,matchVectors[row])))
    
    if args.dump_stats:
        # Report and stop before creating any temp file, so nothing is left
        # behind on disk.
        for row in txList:
            sys.stdout.write(formatRow(row))
        sys.exit(0)
    
    # Save vectors and this info out to file.  delete=False because the file
    # is closed here and moved into out_dir at the end.
    sortedTxListFile = NamedTemporaryFile(mode='w+t',prefix='txExpnVectFilteredBy_r.',suffix=".tsv",delete=False)
    for row in txList:
        sortedTxListFile.write(formatRow(row))
    tmp_files['sortedTxListFile'] = sortedTxListFile
    sortedTxListFile.close()
    
    g2gObj = gtf_to_genes.get_indexed_genes_matching_gtf_file_name(index_file_name=args.gtf_index, logger=logger, regex_str=args.gtf)[-1]
    txDict = filter_GTF_4_Tx(txList=[x[2] for x in txList],g2gObj=g2gObj)
    tmp_files['txBedFile'] = convert_2_bed(txDict=txDict)
    
    # 2: Use GTFs, BEDtools, and genome FASTAs to extract the upstream flanking sequences into a new FASTA
    fastaRecLengths,fastaSeqs = fastaRec_length_indexer(fastaFiles=args.genome_fastas)
    tmpFastaRecLengthFile = NamedTemporaryFile(mode='w+b',prefix='tmpFastaRecLengthFile.',suffix=".txt")
    for seqRec in fastaRecLengths:
        tmpFastaRecLengthFile.write("%s\t%s\n" % (seqRec,fastaRecLengths[seqRec]))
    # flush so the helper below can read the content through the file name
    tmpFastaRecLengthFile.flush()

    # Concatenate all fasta records into one temporary file
    megaFastaFile = NamedTemporaryFile(mode='w+b',prefix='tmpMegaFastaFile.',suffix=".fas")
    for fasta in fastaSeqs:
        megaFastaFile.write('>%s\n%s\n' % (fasta,fastaSeqs[fasta]))
    megaFastaFile.flush()
        
    tmp_files['flankBed'] = get_fastas(txBed=tmp_files.txBedFile.name,genomeFasta=megaFastaFile.name,lenIndex=tmpFastaRecLengthFile.name,lenFlanks=args.flank_len)
    
    # CLEAN UP: close all tmp_files and move them to args.out_dir
    mkdirp(args.out_dir)
    for f in tmp_files:
        # Not every stored object has these attributes/methods, so each step
        # is attempted independently.
        try:
            tmp_files[f].delete = False
        except AttributeError:
            pass
        try:
            tmp_files[f].close()
        except AttributeError:
            pass
    
    sortedTxListPath = "%s/sortedTxList.tsv" % (args.out_dir)
    flankBedPath     = "%s/flankBed.bed" % (args.out_dir)
    txBedPath        = "%s/txBed.bed" % (args.out_dir)
    flankFastaPath   = "%s/flankFasta.fas" % (args.out_dir)
    
    # 0o775 is the same value as the Python-2-only literal 0775
    shutil.move(tmp_files.sortedTxListFile.name, sortedTxListPath)
    os.chmod(sortedTxListPath,0o775)
    
    tmp_files.flankBed.saveas(flankBedPath)
    os.chmod(flankBedPath,0o775)
    
    shutil.move(tmp_files.txBedFile.name, txBedPath)
    os.chmod(txBedPath,0o775)
    
    shutil.move(tmp_files.flankBed.seqfn, flankFastaPath)
    os.chmod(flankFastaPath,0o775)
    
    if args.dump_megafasta:
        megaFasta = "%s/megaFasta.fas" % (args.out_dir)
        # keep the file on close so it can be moved
        megaFastaFile.delete = False
        megaFastaFile.close()
        shutil.move(megaFastaFile.name, megaFasta)
        os.chmod(megaFasta,0o775)
    else:
        # closing a delete-on-close temp file removes it
        megaFastaFile.close()
    tmpFastaRecLengthFile.close()
コード例 #11
0
ファイル: bed.py プロジェクト: asntech/rSeqPipeline
def _bedProbeLine(bedPath):
    """Return the line of <bedPath> that is used to sniff the file's format.

    The second line is returned when the file has one (this is the line the
    format checks have always been run against -- presumably so that a leading
    header line is not what gets tested; confirm against isBEDline()).  When the
    file holds a single line only, that line is returned instead so that a valid
    one-feature BED file is not reported as malformed.  An empty file yields ''."""
    # 'with' guarantees the handle is released even if a read fails.
    with open(bedPath, "rU") as bedFile:
        firstLine = bedFile.readline()
        secondLine = bedFile.readline()
    if secondLine:
        return secondLine
    return firstLine


def divByWindow(bedA_Path, bedB_Path, win=[500, 500], cols=[6, 6], side="right", outDir="."):
    """Create files separating features in bedB by those falling within the area defined
    by <win> and those outside this area in bedA.  If A.bed is stranded, the area is defined
    by win[0] upstrm and win[1] dwnstrm on the FEATURE's strand.  Otherwise its
    win[0] upstrm and win[1] dwnstrm on the CONTIG/CHROM's plus strand.  Files output
    to outDir.

    Args:
        bedA_Path: path to the BED file whose features define the windows.
        bedB_Path: path to the BED file whose features are divided.
        win:       two-item sequence (left, right) passed to windowBed as -l / -r.
        cols:      forwarded unchanged to extractFromDoubleSidedBedtoolOut().
        side:      forwarded unchanged to extractFromDoubleSidedBedtoolOut().
        outDir:    directory that receives every output file (created if missing).

    Returns:
        Tuple (pathOfBedBFeaturesInsideWindow, pathOfBedBFeaturesOutsideWindow).

    Raises:
        InvalidFileFormatError: if either input does not look like a BED file.

    NOTE: See DOC for extractFromDoubleSidedBedtoolOut() regarding 'cols' and 'side'
    NOTE: the list defaults are shared between calls; they are only read here, never
          mutated, and are kept as lists so the signature stays unchanged."""

    # Prepare outDir if it doesnt already exist
    mkdirp(outDir)

    # Collect some useful info
    bedA_name = bedA_Path.split("/")[-1].replace(".bed", "")
    bedB_name = bedB_Path.split("/")[-1].replace(".bed", "")
    B_in_A_winComboPath = "%s/%s_featsIn_%s_Win%sl%sr_combo.bed" % (outDir, bedB_name, bedA_name, win[0], win[1])

    # Establish whether inputs look like BED files:
    probeA = _bedProbeLine(bedA_Path)
    probeB = _bedProbeLine(bedB_Path)

    if not isBEDline(probeA):
        raise InvalidFileFormatError("%s does not seem to be in BED format." % (bedA_Path))
    if not isBEDline(probeB):
        raise InvalidFileFormatError("%s does not seem to be in BED format." % (bedB_Path))

    # If bedA is stranded: use windowBed with -sw option, otherwise with only -l,-r options
    # to create file from bedB features INSIDE window around features in bedA.
    winArgs = "-a %s -b %s -l %s -r %s" % (bedA_Path, bedB_Path, win[0], win[1])
    if isStranded(probeA):
        winArgs += " -sw"
    # NOTE(review): paths are interpolated into the command string as-is; paths containing
    # spaces or shell metacharacters will break (or alter) the command -- confirm how
    # runExternalApp() executes its argument string.
    runExternalApp("windowBed", "%s > %s" % (winArgs, B_in_A_winComboPath))

    # Clean B_in_A_winComboPath of the matching bedA entry and remove any redundant bedB entries
    cleanedBsInWinPath = extractFromDoubleSidedBedtoolOut(B_in_A_winComboPath, cols=cols, side=side, outDir=outDir)
    # Change file name to reflect its not combo anymore
    cleanedBsInWinNewPath = cleanedBsInWinPath.replace("_combo_", "_cleaned_")
    runExternalApp("mv", "%s %s" % (cleanedBsInWinPath, cleanedBsInWinNewPath))

    # Create file with bedB feats OUTSIDE of window of features in bedA.
    cleanedBsNotInWinPath = cleanedBsInWinNewPath.replace("_featsIn_", "_featsNotIn_")
    onlyInA(bedB_Path, cleanedBsInWinNewPath, cleanedBsNotInWinPath)

    # Return Filenames of divided bed files
    return (cleanedBsInWinNewPath, cleanedBsNotInWinPath)
コード例 #12
0
def main():
    """Command-line entry point: test motifs for enrichment in a foreground gene set.

    Steps, in order:
      1. Parse and validate the command-line options.
      2. Build the promoter probeset with getProbSet(); when --plen is given, keep
         only the last --plen positions of every promoter.
      3. Load the motifs with getMotifs() and filter them with the --motif-filter lambda.
      4. Run motifHyprGeoEnrichmentTAMO() on the real foreground (--genes) and then on
         --plot-fdrs random foregrounds, collecting every result with appendData().
      5. Write the data table and plot one histogram per motif.

    Raises:
        InvalidOptionError: if a required option is missing or an option value is invalid.

    Exits with status 0 after printing the help text (no arguments given) or after
    printing the sequence statistics (--check-seqs)."""
    # +++++++++++ File Parsing Etc +++++++++++
    desc = """Calls the folowing funcs: 'add this'"""

    usage = """python %prog args"""
    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option('--motifs', type='str',default=None,
                      help="""Path to motif file (default=%default).""")
    parser.add_option('--motif-type', type='str',default='scope',
                      help="""Format of motif file (default=%default).""")
    parser.add_option('--motif-filter', type='str',default='lambda x: x',
                      help="""Filter fuction to allow filtering of motifs in motif file.  Its a lambda function.  The default returns all motifs. If you don't understand this, please leave it alone. (default=%default).""")
    parser.add_option('--thresh', type='float',default=0.75,
                      help="""Fractional score threshold for motif score cut-off (default=%default).""")
    parser.add_option('--promoters', type='str',default=None,
                      help="""Path to fasta file with promoter population (default=%default).""")
    parser.add_option('--plen', type='int',default=None,
                      help="""Max promoter length to use -- starting from 3'-end!! (default=%default).""")
    parser.add_option('--genes', type='string',default=None,
                      help="""Unbroken string of gene/Tx names representing the true forground set, sep=','. Exp: 'gene,gene,gene' (default=%default).""")
    parser.add_option('--job', type='string',default='int(time())',
                      help="""String to identify this run (default=%default).""")
    parser.add_option('--out', type='string',default=None,
                      help="""Path to results dir -- must use full path (default=%default).""")
    parser.add_option('--plot-fdrs', dest="fdrs", type='int',default=0,
                      help="""How many random sets of genes equal in length to --genes to run for FDR estimation. (default=%default).""")
    # Disabled option, kept for reference:
    ##parser.add_option('--from-possum',default=False,
                      ##help="""Path to possumSearch outFile, skips motif finding step. (default=%default)""")
    parser.add_option('--verbose',action='store_true',default=False,
                      help="""Include if stdOut/stdErr is desired. (default=%default).""")
    parser.add_option('--check-seqs',action='store_true',default=False,
                      help="""Print info about promoter sequences and exit. (default=%default).""")
    # Disabled option, kept for reference:
    ##parser.add_option('--expect',action='store_true',default=False,
                      ##help="""Use median seqLen to set success threshold at greater than the estimated expected number of occurences in each promoter [pValThresh*searchesPerSeq]. (default=%default).""")

    opts = parser.parse_args()[0]

    # +++++ Argument Validations +++++
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    if not opts.out:
        raise InvalidOptionError("--out argument is required.")
    if not opts.genes:
        raise InvalidOptionError('--genes argument is required.')
    if opts.job == 'int(time())':
        # The default is a placeholder string; replace it with the current epoch second.
        opts.job = int(time())

    # Every one of these must be present.  (The previous test joined them with 'or',
    # so the always-set defaults of --motif-type/--thresh kept it from ever firing.)
    required = (('--motifs', opts.motifs),
                ('--motif-type', opts.motif_type),
                ('--thresh', opts.thresh),
                ('--promoters', opts.promoters))
    missing = [flag for flag, value in required if value is None or value == '']
    if missing:
        raise InvalidOptionError("The following required arguments are missing:\n        %s" % ("\n        ".join(missing)))

    if not opts.motif_filter.startswith('lambda x:'):
        raise InvalidOptionError("**ERROR: the --motif-filter option must begin with 'lambda x:'**")
    # NOTE(security): eval() executes whatever follows 'lambda x:' on the command line.
    # Acceptable only because the caller already controls this process; never feed it
    # input from an untrusted source.
    opts.motif_filter = eval(opts.motif_filter)

    # optparse already guarantees --plen is an int; a negative value would make the
    # slice below drop the 5' end instead of keeping the 3' end, so reject it here.
    if opts.plen is not None and opts.plen < 0:
        raise InvalidOptionError("**ERROR: the --plen option must be a positive number**")

    def say(msg):
        """Write <msg> to stdout, but only when --verbose was given."""
        if opts.verbose:
            sys.stdout.write(msg)

    # +++++ Lets Begin +++++
    say('\n%s\n\n' % (' '.join(sys.argv)))

    say('preparing out directory...\n')
    outBaseStr = getOutBaseStr(opts.out,opts.job)
    mkdirp(opts.out)

    say('building seqDict...\n')
    probeset = getProbSet(opts.promoters,opts.thresh)
    if opts.plen:
        say("adjusting promoter lengths to no more than %s and conserving the 3' ends...\n" % (opts.plen))
        # Iterate over a snapshot so values can be reassigned while looping.
        for name, seq in list(probeset.probes.items()):
            probeset.probes[name] = seq[-opts.plen:]

    seqDict = getSeqs(probeset)
    if opts.check_seqs:
        seqStats(seqDict,show=True)
        sys.exit(0)

    say('getting nucFreqs...\n')
    nucFreqs = getSeqFreqs(seqDict)

    say('building filtered motifList...\n')
    motifList = getMotifs(opts.motifs,nucFreqs,opts.motif_type)
    preFilt   = len(motifList)
    motifList = filterMotifs(motifList,key=opts.motif_filter)
    postFilt  = len(motifList)
    say('using %s of %s motifs...\n' % (postFilt,preFilt))

    say('getting forgroundSeqs...\n')
    foregroundSeqs = getForeground(opts.genes)
    outData        = {}

    say('calculating real data p-values...\n')
    realData = motifHyprGeoEnrichmentTAMO(motifList,probeset,foregroundSeqs,factor=opts.thresh,bestFactor=False)
    appendData(realData,outData)

    # Control runs: one random foreground per requested FDR set.
    for i in range(opts.fdrs):
        say('ctrl_%s:\n' % (i))
        say('\tgetting random forground...\n')
        rForground = getRandomForground(foregroundSeqs,seqDict)

        say('\tcalculating p-values...\n')
        ctrlData = motifHyprGeoEnrichmentTAMO(motifList,probeset,rForground,factor=opts.thresh,bestFactor=False)
        appendData(ctrlData,outData)

    say('writting outData...\n')
    writeDataTable(outBaseStr,outData,motifList)

    say('plotting histograms...\n')
    for m in motifList:
        plotHist(outBaseStr,outData,m.id)
コード例 #13
0
 parser.add_option('--bt-opts', dest="bt_opts", type='string',default=defaultBtOpts,
                   help="""Quoted string to pass as arguments to bowtie that have not already been provided.  Defualt mimics Eland. Unless --override is used, the options will be appended to the default. (default=%default)""")
 parser.add_option('--out-dir', dest="out_dir", type='string',default=None,
                   help="""Central directory to deposit output files. (default=%default)""")
 parser.add_option('--cent-log', dest="cent_log", action='store_true',default=False,
                   help="""Include to redirect stdout and stderr to a central log file in out_dir. [useful for reproducibility and debugging] (default=%default)""")
 parser.add_option('--override', dest="override", action='store_true',default=False,
                   help="""A switch that causes the default --bt-opts to be replaced by those provided as arguments to --bt-opts instead of adding to them. (default=%default)""")
 
 (opts, args) = parser.parse_args()
 
 if len(sys.argv) == 1:
     parser.print_help()
     exit(0)
 if opts.out_dir:
     mkdirp(opts.out_dir)
     opts.out_dir = opts.out_dir.rstrip('/')
 else:
     opts.out_dir = os.getcwd()
 if opts.cent_log:
     start_sitrep(opts.out_dir)
 if not 'BOWTIE_INDEXES' in os.environ:
     raise Exception('ERROR: please set the BOWTIE_INDEXES environment variable!')
 try:
     opts.bt_index
     opts.bam_base
     opts.fastqs
 except KeyError:
     raise MissingArgumentError('missing at least one of the required command line arguments: --bt-index, --bam-base, --fastqs')        
 if not opts.override:
     opts.bt_opts = "%s %s" % (defaultBtOpts,opts.bt_opts)