def buildAlignmentSizes(infiles, outfile):
    '''
    Sum the total number of bases that are aligned to each genome.

    For every bed file in *infiles* the interval lengths
    (``end - start``) are added up and one row is written to *outfile*.
    The table starts with the header line ``genome<TAB>size``.

    The genome label is ``P.snip(basename, ".bed.gz")`` -- presumably
    the basename with the ``.bed.gz`` suffix removed (confirm against
    ``P.snip``).

    Arguments
    ---------
    infiles : iterable of str
        Paths of the bed files, one per genome.
    outfile : str
        Path of the tab-separated table to write.
    '''
    # ``with`` guarantees the table is flushed and closed even when one of
    # the inputs cannot be opened or parsed.
    with open(outfile, "w") as outf:
        outf.write("genome\tsize\n")
        for infile in infiles:
            genome = P.snip(os.path.basename(infile), ".bed.gz")
            inf = iotools.openFile(infile)
            try:
                c = sum(bed.end - bed.start for bed in Bed.iterator(inf))
            finally:
                # Release the input handle; previously it was left open
                # for every file processed.
                inf.close()
            outf.write("%s\t%s\n" % (genome, str(c)))
def buildAlignmentSizes(infiles, outfile):
    '''
    Sum the total number of bases that are aligned to each genome.

    For every bed file in *infiles* the interval lengths
    (``end - start``) are added up and one row is written to *outfile*.
    The table starts with the header line ``genome<TAB>size``.

    The genome label is ``P.snip(basename, ".bed.gz")`` -- presumably
    the basename with the ``.bed.gz`` suffix removed (confirm against
    ``P.snip``).

    Arguments
    ---------
    infiles : iterable of str
        Paths of the bed files, one per genome.
    outfile : str
        Path of the tab-separated table to write.
    '''
    # ``with`` guarantees the table is flushed and closed even when one of
    # the inputs cannot be opened or parsed.
    with open(outfile, "w") as outf:
        outf.write("genome\tsize\n")
        for infile in infiles:
            genome = P.snip(os.path.basename(infile), ".bed.gz")
            inf = IOTools.openFile(infile)
            try:
                c = sum(bed.end - bed.start for bed in Bed.iterator(inf))
            finally:
                # Release the input handle; previously it was left open
                # for every file processed.
                inf.close()
            outf.write("%s\t%s\n" % (genome, str(c)))
def getOverlappingRegionDict(bd1, bd2, debug=True):
    '''
    Return a dictionary of overlapping regions within two dictionaries
    of Bed regions.

    Both arguments map a chromosome name to a list of Bed-like objects
    exposing ``chrom``, ``chromStart`` and ``chromEnd``. Chromosomes that
    are missing from (or empty in) *bd2* are skipped. For the remaining
    ones the two lists are swept in parallel and every strictly positive
    intersection ``[max(start), min(end))`` is emitted as a new
    ``Bed.Bed``. Intervals that merely touch produce no output.

    NOTE(review): the forward-only sweep presumably expects each list to
    be sorted by position -- confirm against the callers.

    Arguments
    ---------
    bd1, bd2 : dict
        chromosome -> list of Bed regions.
    debug : bool
        If true, both lists of every shared chromosome are validated with
        ``isNonContiguous`` and ``errAbort`` is called on failure.

    Returns
    -------
    dict
        chromosome -> list of overlapping regions. Chromosomes without
        any overlap do not appear as keys.
    '''
    retval = {}
    for chrom, bedList1 in bd1.items():
        bedList2 = bd2.get(chrom, [])
        if not bedList2:
            continue
        if debug:
            for bedList in (bedList1, bedList2):
                if not isNonContiguous(bedList):
                    errAbort(
                        "Calculating overlapping regions must have non-contiguous input elements."
                    )
        b1Len = len(bedList1)
        b1Idx = 0
        b2Len = len(bedList2)
        b2Idx = 0
        while (b1Idx < b1Len) and (b2Idx < b2Len):
            b1Curr = bedList1[b1Idx]
            b2Curr = bedList2[b2Idx]
            assert b1Curr.chrom == b2Curr.chrom
            maxStart = max(b1Curr.chromStart, b2Curr.chromStart)
            minEnd = min(b1Curr.chromEnd, b2Curr.chromEnd)
            if maxStart < minEnd:
                # Append in place: rebuilding the list with ``+`` for every
                # hit copied all previous results (quadratic in the number
                # of overlaps).
                retval.setdefault(b1Curr.chrom, []).append(
                    Bed.Bed("%s\t%d\t%d" % (b1Curr.chrom, maxStart, minEnd))
                )
            # Advance whichever interval ends first; on a tie both are
            # exhausted and both cursors move on.
            if b1Curr.chromEnd < b2Curr.chromEnd:
                b1Idx += 1
            elif b1Curr.chromEnd > b2Curr.chromEnd:
                b2Idx += 1
            else:
                b1Idx += 1
                b2Idx += 1
    return retval
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The first positional argument is a bed file of exon coordinates, the
    second a BAM file. For each run of consecutive bed records sharing
    the same ``name`` one line ``gene_id<TAB>counts<TAB>length`` is
    written to ``options.stdout``:

    * ``counts`` is the number of distinct ``(qname, is_read1)`` pairs
      among the reads fetched over all records of the group, so a read
      spanning several exons is counted once.
    * ``length`` is the sum of ``end - start`` over the group's records.

    Records are grouped with ``itertools.groupby``, which only merges
    adjacent items: a gene whose records are not contiguous in the bed
    file is reported once per run.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                    usage = globals()["__doc__"] )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start( parser, argv = argv )

    coords_file = args[0]

    bamfile = pysam.Samfile( args[1], 'rb' ) # bamfile
    try:
        options.stdout.write( "gene_id\tcounts\tlength\n" )

        infile = IOTools.openFile( coords_file )
        try:
            # named ``beds`` so that the builtin ``iter`` is not shadowed
            beds = Bed.iterator( infile )
            for gene_id, exons in itertools.groupby( beds, lambda x: x.name ):
                anames = set()
                lgene = 0

                for bed in exons:
                    lgene += bed.end - bed.start
                    for alignedread in bamfile.fetch( bed.contig, bed.start, bed.end ):
                        # the read1 flag keeps both mates of a pair distinct
                        anames.add( (alignedread.qname, alignedread.is_read1) )

                num_reads = len( anames )
                options.stdout.write( "\t".join( (gene_id, str(num_reads), str(lgene)) ) + "\n" )
        finally:
            # previously neither input handle was ever released
            infile.close()
    finally:
        bamfile.close()

    ## write footer and output benchmark information.
    E.Stop()
def bedReadFromFile(fn, chromSizes=None):
    '''
    Read a file of Bed regions into a list of Beds and return it.

    Every line of the file, including its trailing newline, is passed
    unchanged to ``Bed.Bed`` together with *chromSizes*. No line is
    skipped, so blank or header lines reach the constructor as well.

    Arguments
    ---------
    fn : str
        Path of the file to read.
    chromSizes :
        Forwarded to ``Bed.Bed`` as the ``chromSizes`` keyword.

    Returns
    -------
    list
        One ``Bed.Bed`` per line, in file order.
    '''
    # ``with`` closes the file even if ``Bed.Bed`` raises on a malformed
    # line; iterating the handle avoids holding a second copy of all lines.
    with open(fn) as f:
        return [Bed.Bed(line, chromSizes=chromSizes) for line in f]
def main( argv = sys.argv ):
    """Convert bed records read from ``options.stdin`` into gff/gtf lines.

    Each bed record yields one entry with source ``bed`` and feature
    ``exon``, written to ``options.stdout``. The optional bed columns in
    ``bed.mFields`` are mapped as follows:

    * field 0 -> ``gene_id`` and ``transcript_id`` with ``--as-gtf``,
      otherwise the entry's ``source``;
    * field 1 -> ``score``;
    * field 2 -> ``strand`` (``.`` when absent).

    With ``--as-gtf``, records without optional fields receive a running
    number formatted with ``id_format`` as gene and transcript id.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start``;
    left as is to keep the current command line handling.
    """

    parser = optparse.OptionParser( version = "%prog version: $Id: bed2gff.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"] )

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf." )

    parser.set_defaults( as_gtf = False,
                         id_format = "%08i",
                         test = None )

    (options, args) = E.Start( parser, add_pipe_options = True )

    as_gtf = options.as_gtf
    id_format = options.id_format

    entry_class = GTF.Entry if as_gtf else GFF.Entry

    # nskipped is reported for completeness; no record is ever skipped here
    ninput, noutput, nskipped = 0, 0, 0

    # running number for generated ids (named so builtin ``id`` stays usable)
    next_id = 0
    for bed in Bed.iterator( options.stdin ):

        ninput += 1

        # A fresh entry per record: reusing a single object let ``score``
        # (and ``source`` in gff mode) of an earlier record leak into later
        # records that have fewer columns.
        gff = entry_class()
        gff.source = "bed"
        gff.feature = "exon"

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end

        fields = bed.mFields
        if fields and len(fields) >= 3:
            gff.strand = fields[2]
        else:
            gff.strand = "."

        if fields and len(fields) >= 2:
            gff.score = fields[1]

        if as_gtf:
            if fields:
                gff.gene_id = fields[0]
                gff.transcript_id = fields[0]
            else:
                next_id += 1
                gff.gene_id = id_format % next_id
                gff.transcript_id = id_format % next_id
        else:
            if fields:
                gff.source = fields[0]

        options.stdout.write( str(gff) + "\n" )

        noutput += 1

    E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped) )

    E.Stop()