def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes

    For every bed file in *infiles* the interval lengths
    (``end - start``) of all records are added up.  *outfile*
    receives the header ``genome<TAB>size`` followed by one row per
    input file.  The genome label is
    ``P.snip(basename(infile), ".bed.gz")``.

    The output file and every input handle are closed even if
    reading or parsing fails part-way through.
    '''
    with open(outfile, "w") as outf:
        outf.write("genome\tsize\n")
        for infile in infiles:
            genome = P.snip(os.path.basename(infile), ".bed.gz")
            inf = iotools.openFile(infile)
            # try/finally instead of ``with``: only a close() method is
            # assumed of the handle returned by openFile.
            try:
                total = sum(bed.end - bed.start
                            for bed in Bed.iterator(inf))
            finally:
                inf.close()
            outf.write("%s\t%s\n" % (genome, total))
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes

    For every bed file in *infiles* the interval lengths
    (``end - start``) of all records are added up.  *outfile*
    receives the header ``genome<TAB>size`` followed by one row per
    input file.  The genome label is
    ``P.snip(basename(infile), ".bed.gz")``.

    The output file and every input handle are closed even if
    reading or parsing fails part-way through.
    '''
    with open(outfile, "w") as outf:
        outf.write("genome\tsize\n")
        for infile in infiles:
            genome = P.snip(os.path.basename(infile), ".bed.gz")
            inf = IOTools.openFile(infile)
            # try/finally instead of ``with``: only a close() method is
            # assumed of the handle returned by openFile.
            try:
                total = sum(bed.end - bed.start
                            for bed in Bed.iterator(inf))
            finally:
                inf.close()
            outf.write("%s\t%s\n" % (genome, total))
Esempio n. 3
0
def getOverlappingRegionDict(bd1, bd2, debug=True):
    ''' Return a dictionary of overlapping regions within two dictionaries of Bed regions

    bd1 and bd2 map a chromosome name to a list of objects exposing
    ``chrom``, ``chromStart`` and ``chromEnd``.  For each chromosome present
    (with a non-empty list) in both inputs, the two lists are walked in
    lockstep; this presumably requires each list to be sorted by position
    (TODO confirm against callers).

    The result maps a chromosome to the list of intersections, each a new
    ``Bed.Bed`` built from "chrom<TAB>start<TAB>end".  Chromosomes without any
    overlap are absent from the result.

    When *debug* is true, both lists of every shared chromosome are first
    checked with ``isNonContiguous``; ``errAbort`` is called for each list
    that fails the check.
    '''
    retval = {}
    for chrom, bedList1 in bd1.items():
        bedList2 = bd2.get(chrom, [])

        # Nothing to intersect with on this chromosome.
        if not bedList2:
            continue

        if debug:
            for bedList in (bedList1, bedList2):
                if not isNonContiguous(bedList):
                    errAbort(
                        "Calculating overlapping regions must have non-contiguous input elements."
                    )

        b1Len = len(bedList1)
        b2Len = len(bedList2)
        b1Idx = 0
        b2Idx = 0

        while b1Idx < b1Len and b2Idx < b2Len:
            b1Curr = bedList1[b1Idx]
            b2Curr = bedList2[b2Idx]
            assert b1Curr.chrom == b2Curr.chrom
            maxStart = max(b1Curr.chromStart, b2Curr.chromStart)
            minEnd = min(b1Curr.chromEnd, b2Curr.chromEnd)
            if maxStart < minEnd:
                overlap = Bed.Bed(
                    "%s\t%d\t%d" % (b1Curr.chrom, maxStart, minEnd))
                # Append in place: rebuilding the list with ``+`` on every
                # hit copied all earlier overlaps each time (quadratic).
                retval.setdefault(b1Curr.chrom, []).append(overlap)

            # Advance past whichever region ends first; on a tie both
            # regions are exhausted, so advance both.
            if b1Curr.chromEnd < b2Curr.chromEnd:
                b1Idx += 1
            elif b1Curr.chromEnd > b2Curr.chromEnd:
                b2Idx += 1
            else:
                b1Idx += 1
                b2Idx += 1
    return retval
Esempio n. 4
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Two positional arguments are required: a bed file with coordinates
    and a BAM file.  Bed records are grouped by their ``name`` attribute
    and, for every group, one line ``gene_id<TAB>counts<TAB>length`` is
    written to ``options.stdout``:

    * counts - number of distinct (qname, is_read1) pairs among the
      reads fetched from the BAM file over the group's intervals
    * length - sum of ``end - start`` over the group's intervals

    Raises ValueError if fewer than two positional arguments are given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    if len(args) < 2:
        raise ValueError(
            "expected two arguments: a bed file with coordinates and a bam file" )

    coords_file = args[0]

    bamfile = pysam.Samfile( args[1], 'rb' )  # bamfile
    try:
        options.stdout.write( "gene_id\tcounts\tlength\n" )

        coords = IOTools.openFile( coords_file )
        try:
            # itertools.groupby only merges *adjacent* records, so all
            # intervals of a gene must be consecutive in the bed file;
            # otherwise the gene is reported more than once.
            beds = Bed.iterator( coords )
            for gene_id, exons in itertools.groupby( beds, lambda x: x.name ):

                # a set, so a read overlapping several intervals of the
                # same gene is counted only once
                anames = set()
                lgene = 0

                for bed in exons:
                    lgene += bed.end - bed.start
                    for alignedread in bamfile.fetch( bed.contig, bed.start, bed.end ):
                        anames.add( (alignedread.qname, alignedread.is_read1) )

                num_reads = len( anames )
                options.stdout.write( "\t".join( (gene_id,
                                                  str( num_reads ),
                                                  str( lgene ) ) ) + "\n" )
        finally:
            coords.close()
    finally:
        # release the BAM handle even if counting fails
        bamfile.close()

    ## write footer and output benchmark information.
    E.Stop()
Esempio n. 5
0
def bedReadFromFile(fn, chromSizes=None):
    ''' Read a file of Bed regions into a list of Beds and return it

    Every line of *fn* (with its line ending) is passed to ``Bed.Bed``
    together with *chromSizes*; the resulting objects are returned in file
    order.  The file is closed even if constructing a Bed raises.
    '''
    with open(fn) as f:
        # Iterate the handle directly instead of materialising all lines
        # with readlines() first.
        return [Bed.Bed(line, chromSizes=chromSizes) for line in f]
Esempio n. 6
0
def main( argv = None ):
    """Convert bed records read from options.stdin into gff/gtf lines.

    parses command line options in sys.argv, unless *argv* is given.

    One output line is written to options.stdout per bed record, with
    feature ``exon``.  The optional extra bed fields (``bed.mFields``) are
    used as follows:

    * field 0 - gene_id and transcript_id with ``--as-gtf``, otherwise the
      gff source; without it a running number formatted with ``id_format``
      is used for gtf, and the source stays ``bed`` for gff
    * field 1 - score
    * field 2 - strand (``.`` if absent)

    A summary of the record counts is logged through ``E.info``.
    """

    # Resolve the default at call time rather than at import time.
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser( version = "%prog version: $Id: bed2gff.py 2861 2010-02-23 17:36:32Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf."  )

    parser.set_defaults( as_gtf = False,
                         id_format = "%08i",
                         test = None )

    # argv was previously accepted but never forwarded, so a caller-supplied
    # argument list was silently ignored.
    (options, args) = E.Start( parser, argv = argv, add_pipe_options = True )

    as_gtf = options.as_gtf
    id_format = options.id_format

    if as_gtf:
        gff = GTF.Entry()
    else:
        gff = GFF.Entry()

    default_source = "bed"
    gff.source = default_source
    gff.feature = "exon"
    # The same entry object is reused for every record, so remember its
    # initial score to restore it for records without a score column.
    # NOTE(review): "." is assumed as the fallback if Entry defines no
    # default score - confirm against GFF.Entry / GTF.Entry.
    default_score = getattr( gff, "score", "." )

    # nskipped is reported below but no record is ever skipped here.
    ninput, noutput, nskipped = 0, 0, 0

    next_id = 0
    for bed in Bed.iterator( options.stdin ):

        ninput += 1

        fields = bed.mFields
        nfields = len( fields ) if fields else 0

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end

        if nfields >= 3:
            gff.strand = fields[2]
        else:
            gff.strand = "."

        if nfields >= 2:
            gff.score = fields[1]
        else:
            # do not leak the score of the previous record
            gff.score = default_score

        if as_gtf:
            if fields:
                gff.gene_id = fields[0]
                gff.transcript_id = fields[0]
            else:
                next_id += 1
                gff.gene_id = id_format % next_id
                gff.transcript_id = id_format % next_id
        else:
            if fields:
                gff.source = fields[0]
            else:
                # do not leak the source of the previous record
                gff.source = default_source

        options.stdout.write( str(gff) + "\n" )

        noutput += 1

    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped) )

    E.Stop()