コード例 #1
0
def main():
    """Annotate interval lines with per-base scores.

    args[0]: score file (possibly compressed), args[1]: interval file,
    args[2] (optional): output file, defaulting to stdout.  Each output
    line is the original interval fields followed by the scores for
    positions [start, stop).
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_file = open(args[0])
        interval_file = open(args[1])
        if len(args) > 2:
            out_file = open(args[2], "w")
        else:
            out_file = sys.stdout
    except Exception:  # was a bare except, which also swallowed SystemExit
        doc_optparse.exit()

    # Fix: read from the validated args rather than raw sys.argv, and reuse
    # the interval file handle opened above instead of re-opening sys.argv[2].
    scores_by_chrom = read_scores(misc.open_compressed(args[0]))
    for line in interval_file:
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        if chrom in scores_by_chrom:
            ba = scores_by_chrom[chrom]
            scores = [ba[i] for i in range(start, stop)]
        else:
            # Unknown chromosome: emit the interval with no scores
            scores = []
        # Python 3 print (was the Python 2 "print >> out_file" statement)
        print(" ".join(fields), " ".join(map(str, scores)), file=out_file)

    score_file.close()
    interval_file.close()
    # Fix: don't close the interpreter's stdout
    if out_file is not sys.stdout:
        out_file.close()
コード例 #2
0
def main():
    """Print the score of each MAF block read from stdin.

    With --recalculate, scores are recomputed with the HOX70 matrix;
    with --lnorm they are normalized by alignment text length.  Blocks
    with no aligned columns print "NA".
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)

    try:
        lnorm = bool(options.lnorm)
        recalculate = bool(options.recalculate)
    except Exception:  # was a bare except, which hid real errors
        doc_optparse.exit()

    # HOX70 substitution matrix, gap open 400 / gap extend 30
    hox70 = score.build_scoring_scheme("""  A    C    G    T
                                      91 -114  -31 -123
                                    -114  100 -125  -31
                                     -31 -125  100 -114
                                    -123  -31 -114   91 """, 400, 30, default=0)

    maf_reader = maf.Reader(sys.stdin)

    for m in maf_reader:
        if m.text_size == 0:
            # No aligned columns: no meaningful score
            # Python 3 print (was a Python 2 print statement)
            print("NA")
            continue
        s = m.score
        # Recalculate?
        if recalculate:
            s = hox70.score_alignment(m)
        # Normalize?
        if lnorm:
            s = s / m.text_size
        # Print
        print(s)
コード例 #3
0
def main():
    """Emit one score per MAF block from stdin ("NA" for empty blocks).

    --recalculate rescoring uses the HOX70 matrix; --lnorm divides the
    score by the number of alignment columns.
    """
    # Command-line parsing
    options, args = doc_optparse.parse(__doc__)

    try:
        lnorm = bool(options.lnorm)
        recalculate = bool(options.recalculate)
    except Exception:
        doc_optparse.exit()

    # HOX70-style scoring scheme (gap open 400, gap extend 30)
    hox70 = score.build_scoring_scheme(
        """  A    C    G    T
                                      91 -114  -31 -123
                                    -114  100 -125  -31
                                     -31 -125  100 -114
                                    -123  -31 -114   91 """, 400, 30, default=0)

    reader = maf.Reader(sys.stdin)

    for block in reader:
        # Empty alignments have no score to report
        if block.text_size == 0:
            print("NA")
            continue
        value = block.score
        if recalculate:
            # Replace the stored score with a freshly computed one
            value = hox70.score_alignment(block)
        if lnorm:
            # Length-normalize by the number of alignment columns
            value = value / block.text_size
        print(value)
コード例 #4
0
def main():
    """Print merged reference intervals of MAF blocks containing enough
    of the required species.

    args[0]: comma separated species list; args[1]: minimum number of
    those species a block must contain.  Qualifying blocks closer than
    SPAN are fused; intervals shorter than MIN are dropped.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        species = args[0].split(',')
        nrequired = int(args[1])
    except Exception:  # was a bare except
        doc_optparse.exit()

    maf_reader = bx.align.maf.Reader(sys.stdin)

    interval_start = None
    interval_end = None

    for m in maf_reader:
        ref = m.components[0]
        # Does this alignment have enough of the required species
        if nrequired <= len([comp for comp in m.components if comp.src.split('.')[0] in species]):
            if interval_start is None:
                # Open a new interval at this block
                interval_start = ref.start
                interval_end = ref.end
            else:
                if ref.start - interval_end < SPAN:
                    # Close enough to the previous block: extend interval
                    interval_end = ref.end
                else:
                    # Gap too large: flush if long enough, then restart
                    if interval_end - interval_start >= MIN:
                        print(ref.src.split('.')[1], interval_start, interval_end)
                    interval_start = ref.start
                    interval_end = ref.end
        else:
            # Block fails the species requirement: flush any open interval
            if interval_start is not None and interval_end - interval_start >= MIN:
                print(ref.src.split('.')[1], interval_start, interval_end)
            interval_start = None
            interval_end = None
コード例 #5
0
def main():
    """Restrict stdin MAF blocks to the given species and write to stdout.

    Species come as separate arguments or one comma separated list.
    All-gap columns are stripped, scores reset to 0.0, and adjacent
    blocks fused unless --nofuse is set.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        species = args
        # Allow a comma separated list, TODO: allow a newick format tree
        if len(species) == 1 and ',' in species[0]:
            species = species[0].split(',')
        fuse = not (bool(options.nofuse))
    except Exception:  # was a bare except
        doc_optparse.exit()

    maf_reader = bx.align.maf.Reader(sys.stdin)
    maf_writer = bx.align.maf.Writer(sys.stdout)

    if fuse:
        # Wrap the writer so compatible adjacent blocks are merged on write
        maf_writer = FusingAlignmentWriter(maf_writer)

    for m in maf_reader:
        new_components = get_components_for_species(m, species)
        if new_components:
            remove_all_gap_columns(new_components)
            m.components = new_components
            # Old score is invalid after dropping components
            m.score = 0.0
            maf_writer.write(m)

    maf_reader.close()
    maf_writer.close()
コード例 #6
0
def main():
    """Filter a MAF stream to the requested species.

    Reads MAF from stdin and writes to stdout.  Species are separate
    arguments or one comma separated list; all-gap columns are removed,
    scores zeroed, and blocks fused unless --nofuse is given.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        species = args
        # Allow a comma separated list, TODO: allow a newick format tree
        if len(species) == 1 and "," in species[0]:
            species = species[0].split(",")
        fuse = not (bool(options.nofuse))
    except Exception:  # was a bare except
        doc_optparse.exit()

    maf_reader = bx.align.maf.Reader(sys.stdin)
    maf_writer = bx.align.maf.Writer(sys.stdout)

    if fuse:
        # Fusing writer merges compatible adjacent blocks as they are written
        maf_writer = FusingAlignmentWriter(maf_writer)

    for m in maf_reader:
        new_components = get_components_for_species(m, species)
        if new_components:
            remove_all_gap_columns(new_components)
            m.components = new_components
            # Old score no longer applies to the reduced block
            m.score = 0.0
            maf_writer.write(m)

    maf_reader.close()
    maf_writer.close()
コード例 #7
0
def main():
    """Convert a single-chromosome wiggle file to a BinnedArray file.

    args[0]: score file (possibly compressed); args[1]: output file.
    --comp selects the BinnedArray compression type.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        if options.comp:
            comp_type = options.comp
        else:
            comp_type = None
        score_fname = args[0]
        out_fname = args[1]
    except Exception:  # was a bare except
        doc_optparse.exit()

    scores = BinnedArray()

    # NOTE(review): input is assumed to cover a single chromosome; the old
    # sanity check against mixed chromosomes was commented out upstream.
    for i, (chrom, pos, val) in enumerate(bx.wiggle.Reader(misc.open_compressed(score_fname))):
        scores[pos] = val
        # Status
        if i % 10000 == 0:
            # Python 3 print (was a Python 2 print statement)
            print(i, "scores processed")

    out = open(out_fname, "w")
    if comp_type:
        scores.to_file(out, comp_type=comp_type)
    else:
        scores.to_file(out)
    out.close()
コード例 #8
0
def main():
    """Print interval lines followed by per-base scores.

    args[0]: score file (possibly compressed), args[1]: interval file,
    args[2] (optional): output file, defaulting to stdout.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_file = open(args[0])
        interval_file = open(args[1])
        if len(args) > 2:
            out_file = open(args[2], 'w')
        else:
            out_file = sys.stdout
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Fix: use the validated args / already-open handle, not raw sys.argv
    scores_by_chrom = read_scores(misc.open_compressed(args[0]))
    for line in interval_file:
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        if chrom in scores_by_chrom:
            ba = scores_by_chrom[chrom]
            scores = [ba[i] for i in range(start, stop)]
        else:
            # Unknown chromosome: no scores for this interval
            scores = []
        # Python 3 print (was the Python 2 "print >> out_file" statement)
        print(" ".join(fields), " ".join(map(str, scores)), file=out_file)

    score_file.close()
    interval_file.close()
    # Fix: only close the output when we actually opened a file
    if out_file is not sys.stdout:
        out_file.close()
コード例 #9
0
def main():
    """Build an interval index for a MAF file.

    args[0] may be plain, .bz2 (requires a .bz2t table) or .lzo
    (requires a .lzot table).  The index is written to args[1] or
    <maf_file>.index.  With --species only those species are indexed.
    """
    # Parse command line

    options, args = doc_optparse.parse(__doc__)

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith(".bz2"):
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File(maf_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile(maf_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open(maf_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        if options.species:
            species = options.species.split(",")
        else:
            species = None
    except Exception:  # was a bare except
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader(maf_in)

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right
    while True:
        pos = maf_reader.file.tell()
        # Python 3 fix: use next() instead of the removed .next() method;
        # the MAF reader returns None at end of file rather than raising.
        block = next(maf_reader)
        if block is None:
            break
        for c in block.components:
            # Restrict the index to requested species when given
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add(c.src, c.forward_strand_start, c.forward_strand_end, pos, max=c.src_size)

    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
コード例 #10
0
ファイル: maf_build_index.py プロジェクト: bxlab/HiFive_Paper
def main():
    """Create an interval index for a (possibly compressed) MAF file.

    Supports plain, .bz2 (+ .bz2t table) and .lzo (+ .lzot table)
    inputs; writes the index to args[1] or <maf_file>.index.  --species
    limits indexing to the listed species.
    """
    # Parse command line

    options, args = doc_optparse.parse(__doc__)

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith(".bz2"):
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File(maf_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile(maf_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open(maf_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        if options.species:
            species = options.species.split(",")
        else:
            species = None
    except Exception:  # was a bare except
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader(maf_in)

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right
    while True:
        pos = maf_reader.file.tell()
        # Python 3 fix: .next() no longer exists; the reader yields None at EOF
        block = next(maf_reader)
        if block is None:
            break
        for c in block.components:
            # Honor the optional species restriction
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add(c.src, c.forward_strand_start, c.forward_strand_end, pos, max=c.src_size)

    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
コード例 #11
0
def main():
    """Report average/min/max score over each input interval.

    args[0]: score source (binned-array directory with --binned, else a
    wiggle file); args[1]: interval file; args[2] (optional): output,
    default stdout.  Bases masked via --mask or scoring 'nan' are
    excluded from the statistics.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        if len(args) > 2:
            out_file = open(args[2], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()

    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        scores_by_chrom = load_scores_wiggle(score_fname)

    if mask_fname:
        masks = binned_bitsets_from_file(open(mask_fname))
    else:
        masks = None

    for line in open(interval_fname):
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        total = 0
        count = 0
        # Sentinels outside any plausible score range
        min_score = 100000000
        max_score = -100000000
        for i in range(start, stop):
            if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                # Skip if base is masked
                if masks and chrom in masks:
                    if masks[chrom][i]:
                        continue
                # Get the score, only count if not 'nan'
                score = scores_by_chrom[chrom][i]
                if not isNaN(score):
                    total += score
                    count += 1
                    max_score = max(score, max_score)
                    min_score = min(score, min_score)
        if count > 0:
            avg = total / count
        else:
            # No usable bases in this interval
            avg = "nan"
            min_score = "nan"
            max_score = "nan"

        print("\t".join(
            map(str, [chrom, start, stop, avg, min_score, max_score])),
              file=out_file)

    # Fix: don't close the interpreter's stdout
    if out_file is not sys.stdout:
        out_file.close()
コード例 #12
0
def main():
    """Print avg/min/max score for each interval in an interval file.

    args[0]: scores (binned-array dir with --binned, else wiggle),
    args[1]: intervals, args[2] (optional): output, default stdout.
    Masked bases (--mask) and 'nan' scores are ignored.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        if len(args) > 2:
            out_file = open(args[2], "w")
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:  # was a bare except
        doc_optparse.exit()

    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        scores_by_chrom = load_scores_wiggle(score_fname)

    if mask_fname:
        masks = binned_bitsets_from_file(open(mask_fname))
    else:
        masks = None

    for line in open(interval_fname):
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        total = 0
        count = 0
        # Sentinels outside any plausible score range
        min_score = 100000000
        max_score = -100000000
        for i in range(start, stop):
            if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                # Skip if base is masked
                if masks and chrom in masks:
                    if masks[chrom][i]:
                        continue
                # Get the score, only count if not 'nan'
                score = scores_by_chrom[chrom][i]
                if not isNaN(score):
                    total += score
                    count += 1
                    max_score = max(score, max_score)
                    min_score = min(score, min_score)
        if count > 0:
            avg = total / count
        else:
            # Interval had no scorable bases
            avg = "nan"
            min_score = "nan"
            max_score = "nan"

        # Python 3 print (was the Python 2 "print >> out_file" statement)
        print("\t".join(map(str, [chrom, start, stop, avg, min_score, max_score])), file=out_file)

    # Fix: only close the output if it is a real file, not stdout
    if out_file is not sys.stdout:
        out_file.close()
コード例 #13
0
def __main__():
    """For each interval on stdin, report per-species alignment coverage.

    Intervals are looked up in the indexed MAF files given as arguments;
    coverage is the fraction of the interval each non-reference species
    aligns to.  --prefix is prepended to the source field.
    """
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        maf_files = args
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Open indexed access to mafs
    indexes = [
        bx.align.maf.Indexed(maf_file, maf_file + ".index")
        for maf_file in maf_files
    ]

    # Iterate over input ranges

    for line in sys.stdin:
        fields = line.split()
        src, start, end = fields[0], int(fields[1]), int(fields[2])
        if prefix:
            src = prefix + src

        total_length = end - start

        # Find overlap with reference component
        blocks = []
        for index in indexes:
            blocks += index.get(src, start, end)

        coverage = dict()
        for block in blocks:
            overlap_start = max(start, block.components[0].start)
            overlap_end = min(end, block.components[0].end)
            length = overlap_end - overlap_start
            assert length > 0
            for c in block.components[1:]:
                species = c.src.split('.')[0]
                # Fix: accumulate with get() instead of bare try/except
                coverage[species] = coverage.get(species, 0) + length

        # `line` keeps its trailing newline, so end=' ' continues the line
        print(line, end=' ')
        for key, value in coverage.items():
            print("   ", key.ljust(10), "%0.2f" % (value / total_length))
コード例 #14
0
def __main__():
    """Slice AXT alignments from stdin to ranges in a range file.

    args[0]: file of "start end" ranges; args[1]: index of the reference
    component.  Slices with an empty component or not longer than
    --mincols (default 10) are dropped.
    """
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        range_filename = args[0]
        refindex = int(args[1])
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 10
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Load Intervals

    intersecter = intervals.Intersecter()
    # Python 3 fix: the file() builtin no longer exists; use open()
    for line in open(range_filename):
        fields = line.split()
        intersecter.add_interval(
            intervals.Interval(int(fields[0]), int(fields[1])))

    # Start axt on stdout

    out = bx.align.axt.Writer(sys.stdout)

    # Iterate over input axt

    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start,
                                         ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            # Discard slices where any component became empty
            good = True
            for c in sliced.components:
                if c.size < 1:
                    good = False
            if good and sliced.text_size > mincols:
                out.write(sliced)

    # Close output axt

    out.close()
コード例 #15
0
ファイル: maf_print_chroms.py プロジェクト: CS76/ipo-galaxy
def __main__():
    """Print the chromosome suffix (text after "chr") of the refindex'th
    component of each MAF block read from stdin."""
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)

    try:
        refindex = int(args[0])
    except Exception:  # was a bare except
        doc_optparse.exit()

    maf_reader = maf.Reader(sys.stdin)

    # Fix: original mixed tabs and spaces here (a TabError in Python 3)
    for m in maf_reader:
        c = m.components[refindex].src
        # e.g. "hg18.chr5" -> "5"; Python 3 print (was a print statement)
        print(c[c.rfind("chr") + 3:])
コード例 #16
0
def __main__():
    """For each MAF block on stdin, print the chromosome name suffix
    (the text following "chr") of the component at refindex."""
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)

    try:
        refindex = int(args[0])
    except Exception:
        doc_optparse.exit()

    reader = maf.Reader(sys.stdin)

    for block in reader:
        src = block.components[refindex].src
        # e.g. "hg18.chr5" yields "5"
        suffix_start = src.rfind("chr") + 3
        print(src[suffix_start:])
コード例 #17
0
ファイル: axt_extract_ranges.py プロジェクト: CS76/ipo-galaxy
def __main__():
    """Extract AXT slices overlapping ranges from a range file.

    args[0]: file of "start end" ranges; args[1]: reference component
    index.  Slices with an empty component or text size not above
    --mincols (default 10) are skipped.
    """
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        range_filename = args[0]
        refindex = int(args[1])
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 10
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Load Intervals

    intersecter = intervals.Intersecter()
    # Python 3 fix: file() builtin removed; use open()
    for line in open(range_filename):
        fields = line.split()
        intersecter.add_interval(intervals.Interval(int(fields[0]), int(fields[1])))

    # Start axt on stdout

    out = bx.align.axt.Writer(sys.stdout)

    # Iterate over input axt

    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start, ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            # Reject slices where any component became empty
            good = True
            for c in sliced.components:
                if c.size < 1:
                    good = False
            if good and sliced.text_size > mincols:
                out.write(sliced)

    # Close output axt

    out.close()
コード例 #18
0
def __main__():
    """Print FASTA-like records for ranges taken from a nib sequence file.

    args[0]: file of "start end" lines; args[1]: nib file.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        range_file = open(args[0])
        # Python 3 fix: file() removed; nib data is binary, so open "rb"
        nib_file = open(args[1], "rb")
    except Exception:  # was a bare except
        doc_optparse.exit()

    nib = bx.seq.nib.NibFile(nib_file)

    for line in range_file:
        fields = line.split()
        start, end = int(fields[0]), int(fields[1])
        # Python 3 print (was a Python 2 print statement)
        print(">", start, end)
        print_wrapped(nib.get(start, end - start))
コード例 #19
0
def __main__():
    """Emit header + wrapped sequence for each range in a nib file.

    args[0]: file of "start end" lines; args[1]: nib sequence file.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        range_file = open(args[0])
        # Python 3 fix: file() builtin removed; nib is binary -> "rb"
        nib_file = open(args[1], "rb")
    except Exception:  # was a bare except
        doc_optparse.exit()

    nib = bx.seq.nib.NibFile(nib_file)

    for line in range_file:
        fields = line.split()
        start, end = int(fields[0]), int(fields[1])
        # Python 3 print (was a Python 2 print statement)
        print(">", start, end)
        print_wrapped(nib.get(start, end - start))
コード例 #20
0
ファイル: maf_count.py プロジェクト: springtan/bx-python
def __main__():
    """Count alignments, columns, or bases in a MAF stream on stdin.

    --cols counts alignment columns, --bases counts bases of the --ref
    component (minus --skip characters), default counts blocks.  With
    --each the running count is printed and reset per block.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        if options.cols:
            action = "cols"
        elif options.bases:
            action = "bases"
        else:
            action = "aligns"
        print_each = bool(options.each)
        ref = int(options.ref) if options.ref else 0
        skip = options.skip if options.skip else None
    except Exception:
        doc_optparse.exit()

    maf_reader = bx.align.maf.Reader(sys.stdin)

    count = 0

    for block in maf_reader:
        # Work out how much this block contributes
        if action == "aligns":
            increment = 1
        elif action == "cols":
            increment = block.text_size
        else:  # action == "bases"
            component = block.components[ref]
            increment = component.size
            if skip:
                # Exclude characters matching the skip character
                increment -= component.text.count(skip)
        count += increment

        if print_each:
            # Per-block reporting: emit and reset the counter
            print(count)
            count = 0

    if not print_each:
        print(count)
コード例 #21
0
def __main__():
    """Report per-species coverage of stdin intervals in indexed MAFs.

    Arguments are MAF files (each with a .index); --prefix is prepended
    to the interval source name before lookup.
    """
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        maf_files = args
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Open indexed access to mafs
    indexes = [bx.align.maf.Indexed(maf_file, maf_file + ".index") for maf_file in maf_files]

    # Iterate over input ranges

    for line in sys.stdin:
        fields = line.split()
        src, start, end = fields[0], int(fields[1]), int(fields[2])
        if prefix:
            src = prefix + src

        total_length = end - start

        # Find overlap with reference component
        blocks = []
        for index in indexes:
            blocks += index.get(src, start, end)

        coverage = dict()
        for block in blocks:
            overlap_start = max(start, block.components[0].start)
            overlap_end = min(end, block.components[0].end)
            length = overlap_end - overlap_start
            assert length > 0
            for c in block.components[1:]:
                species = c.src.split('.')[0]
                # Fix: plain accumulation instead of bare try/except
                coverage[species] = coverage.get(species, 0) + length

        # `line` keeps its newline; end=' ' continues on the same line
        print(line, end=' ')
        for key, value in coverage.items():
            print("   ", key.ljust(10), "%0.2f" % (value / total_length))
コード例 #22
0
def __main__():
    """Print sequence for "chrom start end" ranges read from stdin,
    using per-chromosome nib files found in the given directory."""
    options, args = doc_optparse.parse(__doc__)

    try:
        nib_dir = args[0]
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Cache of open NibFile objects keyed by chromosome
    nibs = {}

    for line in sys.stdin:
        fields = line.split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        # Python 3 print (was a Python 2 print statement)
        print(">", chrom, start, end)
        if chrom in nibs:
            nib = nibs[chrom]
        else:
            # Python 3 fix: file() removed; nib data is binary -> "rb"
            nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (nib_dir, chrom), "rb"))
        print_wrapped(nib.get(start, end - start))
コード例 #23
0
def __main__():
    """For each "chrom start end" line on stdin, print a header and the
    wrapped sequence from <nib_dir>/<chrom>.nib."""
    options, args = doc_optparse.parse(__doc__)

    try:
        nib_dir = args[0]
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Lazily-opened NibFile per chromosome
    nibs = {}

    for line in sys.stdin:
        fields = line.split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        # Python 3 print (was a Python 2 print statement)
        print(">", chrom, start, end)
        if chrom in nibs:
            nib = nibs[chrom]
        else:
            # Python 3 fix: file() builtin removed; nib is binary -> "rb"
            nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (nib_dir, chrom), "rb"))
        print_wrapped(nib.get(start, end - start))
コード例 #24
0
def main():
    """Split a wiggle file into per-chromosome BinnedArray files.

    args[0]: wiggle score file.  Each chromosome's scores are written to
    a file named after the chromosome.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
    except Exception:  # was a bare except
        doc_optparse.exit()

    scores = {}
    # Fix: read from the validated score_fname rather than raw sys.argv[1]
    for i, (chrom, pos, val) in enumerate(bx.wiggle.Reader(open(score_fname))):
        if chrom not in scores:
            scores[chrom] = BinnedArray()
        scores[chrom][pos] = val

        # Status
        if i % 10000 == 0:
            # Python 3 print (was a Python 2 print statement)
            print(i, "scores processed")

    # One output file per chromosome, named after it
    # (renamed loop variable: `chr` shadowed the builtin)
    for chrom in scores.keys():
        out = open(chrom, "w")
        scores[chrom].to_file(out)
        out.close()
コード例 #25
0
def main():
    """Read a wiggle file and write one BinnedArray file per chromosome.

    args[0]: wiggle score file; output files are named by chromosome.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
    except Exception:  # was a bare except
        doc_optparse.exit()

    scores = {}
    # Fix: use the parsed score_fname instead of raw sys.argv[1]
    for i, (chrom, pos, val) in enumerate(bx.wiggle.Reader(open(score_fname))):
        if chrom not in scores:
            scores[chrom] = BinnedArray()
        scores[chrom][pos] = val

        # Status
        if i % 10000 == 0:
            # Python 3 print (was a Python 2 print statement)
            print(i, "scores processed")

    # Write each chromosome's array to a file named after the chromosome
    # (loop variable renamed: `chr` shadowed the builtin)
    for chrom in scores.keys():
        out = open(chrom, "w")
        scores[chrom].to_file(out)
        out.close()
コード例 #26
0
def __main__():
    """Write stdin MAF blocks whose reference component does NOT overlap
    any interval loaded from the given files."""
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        assert len(args) > 0
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Load Intervals

    intersector = intervals.Intersecter()

    for f in args:
        for line in open(f):
            # Skip comment and blank lines
            if line.startswith("#") or line.isspace():
                continue
            fields = line.split()
            intersector.add_interval(
                intervals.Interval(int(fields[0]), int(fields[1])))

    # Start MAF on stdout

    out = bx.align.maf.Writer(sys.stdout)

    # Iterate over input MAF

    for maf in bx.align.maf.Reader(sys.stdin):
        # Find overlap with reference component
        intersections = intersector.find(maf.components[0].start,
                                         maf.components[0].end)
        # Write only if no overlap
        if len(intersections) == 0:
            out.write(maf)

    # Close output MAF

    out.close()
コード例 #27
0
ファイル: maf_count.py プロジェクト: CS76/ipo-galaxy
def __main__():
    """Count alignments, columns, or bases in stdin MAF.

    --cols counts columns, --bases counts reference-component bases
    (minus --skip characters), default counts blocks; --each prints and
    resets the count for every block.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        if options.cols:
            action = "cols"
        elif options.bases:
            action = "bases"
        else:
            action = "aligns"
        print_each = bool(options.each)
        if options.ref:
            ref = int(options.ref)
        else:
            ref = 0
        if options.skip:
            skip = options.skip
        else:
            skip = None
    except Exception:  # was a bare except
        doc_optparse.exit()

    maf_reader = bx.align.maf.Reader(sys.stdin)

    count = 0

    for m in maf_reader:

        if action == "aligns":
            count += 1
        elif action == "cols":
            count += m.text_size
        elif action == "bases":
            if skip:
                # Don't count characters matching the skip character
                count += (m.components[ref].size - m.components[ref].text.count(skip))
            else:
                count += m.components[ref].size

        if print_each:
            # Python 3 print (was a Python 2 print statement)
            print(count)
            count = 0

    if not print_each:
        print(count)
コード例 #28
0
def __main__():
    """Filter stdin MAF to blocks whose reference component overlaps no
    interval in the given interval files."""
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        assert len(args) > 0
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Load Intervals

    intersector = intervals.Intersecter()

    for f in args:
        # Python 3 fix: file() builtin removed; use open()
        for line in open(f):
            # Ignore comments and blank lines
            if line.startswith("#") or line.isspace():
                continue
            fields = line.split()
            intersector.add_interval(intervals.Interval(int(fields[0]), int(fields[1])))

    # Start MAF on stdout

    out = bx.align.maf.Writer(sys.stdout)

    # Iterate over input MAF

    for maf in bx.align.maf.Reader(sys.stdin):
        # Find overlap with reference component
        intersections = intersector.find(maf.components[0].start, maf.components[0].end)
        # Write only if no overlap
        if len(intersections) == 0:
            out.write(maf)

    # Close output MAF

    out.close()
コード例 #29
0
def main():
    """Print "src<TAB>position<TAB>value" rows for stdin intervals,
    looked up in one or more indexed wiggle files.

    With --src the source is fixed and lines carry only start/end;
    --strand shifts the strand column accordingly.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return

    try:
        wiggle_files = args
        if options.src:
            fixed_src = options.src
        else:
            fixed_src = None
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
        do_strand = bool(options.strand)
        use_cache = bool(options.usecache)
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Open indexed access to wiggles
    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=use_cache)

    for line in sys.stdin:
        strand = "+"
        fields = line.split()
        if fixed_src:
            # Fixed source: fields are start, end [, strand]
            src, start, end = fixed_src, int(fields[0]), int(fields[1])
            if do_strand:
                strand = fields[2]
        else:
            # Fields are src, start, end [, strand]
            src, start, end = fields[0], int(fields[1]), int(fields[2])
            if do_strand:
                strand = fields[3]
        if prefix:
            src = prefix + src
        blocks = index.get(src, start, end)
        for x, values in blocks:
            for v in values:
                # Python 3 print (was a Python 2 print statement)
                print("%s\t%i\t%f" % (src, x, v))
                x += 1
コード例 #30
0
def main():
    """Emit per-base wiggle values for stdin intervals.

    Looks intervals up in indexed wiggle files and prints one
    "src<TAB>position<TAB>value" line per base.  --src fixes the source
    name; --prefix is prepended to it.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return

    try:
        wiggle_files = args
        if options.src:
            fixed_src = options.src
        else:
            fixed_src = None
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
        do_strand = bool(options.strand)
        use_cache = bool(options.usecache)
    except Exception:  # was a bare except
        doc_optparse.exit()

    # Open indexed access to wiggles
    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=use_cache)

    for line in sys.stdin:
        strand = "+"
        fields = line.split()
        if fixed_src:
            # With a fixed source the line holds start, end [, strand]
            src, start, end = fixed_src, int(fields[0]), int(fields[1])
            if do_strand:
                strand = fields[2]
        else:
            # Otherwise the line holds src, start, end [, strand]
            src, start, end = fields[0], int(fields[1]), int(fields[2])
            if do_strand:
                strand = fields[3]
        if prefix:
            src = prefix + src
        blocks = index.get(src, start, end)
        for x, values in blocks:
            for v in values:
                # Python 3 print (was a Python 2 print statement)
                print("%s\t%i\t%f" % (src, x, v))
                x += 1
コード例 #31
0
def main():
    """Build an interval index for a wiggle file.

    Reads the wiggle file named on the command line (optionally bz2 or
    lzo compressed, with a pre-built seek table) and writes an index
    mapping each track section (chrom, start, end) to the byte offset of
    its header in the file.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    if options.version: return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith(".bz2"):
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File(wiggle_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with bzip-table.")
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile(wiggle_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open(wiggle_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except Exception:
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the iterator, as there is no next() and thus
    # no way to access the positions. The following code is
    # modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None

    mode = "bed"

    while True:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line: break

        if line.isspace() or line.startswith("track") or line.startswith(
                "#") or line.startswith("browser"):
            continue
        elif line.startswith("bed"):
            # BUGFIX: 'fields' was previously referenced here without
            # ever being assigned for this line
            fields = line.split()
            indexes.add(fields[0], int(fields[1]), int(fields[2]), pos)
        elif line.startswith("variableStep") or line.startswith("fixedStep"):
            # New section header: record the extent of the previous one
            if first_pos is not None:
                indexes.add(last_chrom, start, end, first_pos)
            first_pos = pos
            header = bx.wiggle.parse_header(line)
            last_chrom = header['chrom']
            # Shift start by one (wiggle headers use 1-based coordinates)
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header:
                current_span = int(header['span'])
            else:
                current_span = 1
            if 'step' in header:
                current_step = int(header['step'])

            if line.startswith("variableStep"):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "variableStep":
            fields = line.split()
            end = int(fields[0]) - 1 + current_span
        elif mode == "fixedStep":
            end += current_step
        else:
            # BUGFIX: raising a plain string is invalid; raise a real
            # exception type instead
            raise ValueError("Unexpected input line: %s" % line.strip())

    # BUGFIX: flush the final section, which was previously never indexed
    if first_pos is not None:
        indexes.add(last_chrom, start, end, first_pos)

    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
コード例 #32
0

def read_len(f):
    """Read a 'LEN' file and return a mapping from chromosome to length.

    Each line is whitespace-separated: "<chrom> <length> ...".
    """
    lengths = {}
    for row in f:
        cols = row.split()
        lengths[cols[0]] = int(cols[1])
    return lengths


# Parse command line: expects an input bed and a chromosome-length file
options, args = doc_optparse.parse(__doc__)
try:
    in_fname, len_fname = args
except:
    doc_optparse.exit()

# Load the bed intervals into per-chromosome bitsets
bitsets = binned_bitsets_from_file(open(in_fname))

# chrom -> total chromosome length
lens = read_len(open(len_fname))

for chrom in lens:
    if chrom in bitsets:
        bits = bitsets[chrom]
        # Invert so set bits mark positions NOT covered by the input bed
        bits.invert()
        # NOTE(review): this shadows the builtin 'len' and is never read;
        # the complemented runs found below are also never output --
        # confirm this example is complete
        len = lens[chrom]
        end = 0
        while 1:
            # Walk runs of consecutive set bits (complemented intervals)
            start = bits.next_set(end)
            if start == bits.size: break
            end = bits.next_clear(start)
コード例 #33
0
def main():
    """Report per-species alignment coverage for BED3 intervals on stdin.

    Writes one row per interval: chrom, start, end, then for each
    requested species either the fraction of non-missing bases that are
    aligned or "NA" when too much of the interval is missing data.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        species = options.species.split(",")
        prefix = options.prefix
        use_cache = bool(options.usecache)
        if not prefix:
            prefix = ""
    except:
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed(maf_files,
                                      parse_e_rows=True,
                                      use_cache=use_cache)
    # Print header
    print "#chr", "start", "end",
    for s in species:
        print s,
    print
    # Iterate over input ranges
    for line in sys.stdin:
        fields = line.split()
        # Input is BED3+
        chr, start, end = fields[0], int(fields[1]), int(fields[2])
        length = end - start
        assert length > 0, "Interval has length less than one"
        # Prepend prefix if specified
        src = prefix + chr
        # Keep a bitset for each species noting covered pieces
        aligned_bits = []
        missing_bits = []
        for s in species:
            aligned_bits.append(zeros(length, dtype=bool))
            missing_bits.append(zeros(length, dtype=bool))
        # Find overlap with reference component
        blocks = index.get(src, start, end)
        # Determine alignability for each position
        for block in blocks:
            # Determine the piece of the human interval this block covers,
            # relative to the start of the interval of interest
            ref = block.get_component_by_src(src)
            assert ref.strand == "+", \
                "Reference species blocks must be on '+' strand"
            rel_start = max(start, ref.start) - start
            rel_end = min(end, ref.end) - start
            # Check alignability for each species
            for i, s in enumerate(species):
                other = block.get_component_by_src_start(s)
                # Species does not appear at all indicates unaligned (best we
                # can do here?)
                if other is None:
                    continue
                # An empty component might indicate missing data, all other
                # cases (even contiguous) we count as not aligned
                if other.empty:
                    if other.synteny_empty == bx.align.maf.MAF_MISSING_STATUS:
                        missing_bits[i][rel_start:rel_end] = True
                # Otherwise we have a local alignment with some text, call
                # it aligned
                else:
                    aligned_bits[i][rel_start:rel_end] = True
        # Now determine the total alignment coverage of each interval
        print chr, start, end,
        for i, s in enumerate(species):
            aligned = sum(aligned_bits[i])
            missing = sum(missing_bits[i])
            # An interval will be called missing if it is < 100bp and <50%
            # present, or more than 100bp and less that 50bp present (yes,
            # arbitrary)
            # NOTE(review): 'is_missing' is assigned but never used
            is_missing = False
            if length < 100 and missing > (length / 2):
                print "NA",
            elif length >= 100 and missing > 50:
                print "NA",
            else:
                # NOTE(review): under Python 2 this is integer division and
                # truncates the coverage fraction to 0 or 1 -- confirm intent
                print aligned / (length - missing),

        print

    # Close MAF files
    index.close()
コード例 #34
0
from bx.bitset_builders import binned_bitsets_from_file
from bx.cookbook import doc_optparse

def print_bits_as_bed( bits ):
    """Print every run of set bits in *bits* as a BED line.

    NOTE(review): relies on the module-level variable ``chrom`` for the
    chromosome name rather than taking it as a parameter -- confirm.
    """
    end = 0
    while 1:
        # next_set/next_clear walk runs of consecutive set bits
        start = bits.next_set( end )
        if start == bits.size: break
        end = bits.next_clear( start )
        print "%s\t%d\t%d" % ( chrom, start, end )

# Parse command line: two bed files (minuend, subtrahend)
options, args = doc_optparse.parse( __doc__ )
try:
    in_fname, in2_fname = args
except:
    doc_optparse.exit()

# Read both bed files into per-chromosome bitsets
bitsets1 = binned_bitsets_from_file( open( in_fname ) )
bitsets2 = binned_bitsets_from_file( open( in2_fname ) )

# For each chromosome of the first set, clear every position covered by
# the second set, then print the remaining intervals as BED.
# BUGFIX: removed the tautological "if chrom not in bitsets1: continue"
# check, which could never trigger inside "for chrom in bitsets1".
for chrom in bitsets1:
    bits1 = bitsets1[chrom]
    if chrom in bitsets2:
        bits2 = bitsets2[chrom]
        # Invert the second set so iand keeps only uncovered positions
        bits2.invert()
        bits1.iand( bits2 )
    print_bits_as_bed( bits1 )
コード例 #35
0
ファイル: div_snp_table_chr.py プロジェクト: zhouyu/bx-python
def main():
    """Tabulate SNP and divergence counts inside and outside features.

    Positional args: feature bed, AR (args[1]) bed, SNP bed, and a
    directory of per-chromosome divergence bed files (args[3]).  Writes
    per-interval counts to stdout, progress to stderr, and finishes with
    feature/AR SNP and divergence totals.
    """
    options, args = doc_optparse.parse(__doc__)
    # NOTE(review): this try wraps all setup and maps any failure to a
    # usage exit, which can hide real errors -- consider narrowing
    try:
        lens = {}
        if options.lens:
            for line in open(options.lens):
                chrom, length = line.split()
                lens[chrom] = int(length)

        if options.suffix: suffix = options.suffix
        else: suffix = ""

        print("\nReading feature", end=' ', file=sys.stderr)
        interval_file = open(args[0])
        feature = binned_bitsets_from_file(interval_file, lens=lens)
        interval_file.close()
        # reuse interval file
        intervals = {}
        interval_file = open(args[0])
        for line in interval_file:
            fields = line.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            if chrom not in intervals: intervals[chrom] = []
            intervals[chrom].append([start, end])
        interval_file.close()

        print("\nReading ar", end=' ', file=sys.stderr)
        ar = binned_bitsets_from_file(open(args[1]), lens=lens)

        print("\nReading snps", end=' ', file=sys.stderr)
        snp = binned_bitsets_from_file(open(args[2]), lens=lens)
        snp_mask = clone_inverted(snp)
        snp_copy = clone(snp)

        print("\nMasking AR", end=' ', file=sys.stderr)
        ar_mask = clone_inverted(ar)
        print(file=sys.stderr)

        dirname = args[3]

        if options.mask:
            mask = binned_bitsets_from_file(open(options.mask), lens=lens)
        else:
            mask = None
    except:
        doc_optparse.exit()

    # Restrict both the feature and AR sets to the optional mask
    if mask:
        for chrom in mask.keys():
            if chrom in feature: feature[chrom].iand(mask[chrom])
            if chrom in ar: ar[chrom].iand(mask[chrom])

    # divergence and snp counts for all features
    feature_div_count = 0
    feature_snp_count = 0
    ar_div_count = 0
    ar_snp_count = 0

    # collect snp and div
    for chr in feature.keys():

        if chr not in snp: continue
        if chr not in ar: continue

        print("reading %s ..." % chr, end=' ', file=sys.stderr)
        try:
            div = binned_bitsets_from_file(open(dirname + "/%s.bed" %
                                                (chr + suffix)),
                                           lens=lens)
        except:
            print("%s.bed not found" % chr, file=sys.stderr)
            continue

        div[chr].iand(snp_mask[chr])  # div/snp sites count snp-only
        div_copy = clone(div)

        print("AR:", chr, end=' ', file=sys.stderr)
        snp[chr].iand(ar[chr])
        div[chr].iand(ar[chr])
        snp_count = snp[chr].count_range(0, snp[chr].size)
        ar_snp_count += snp_count
        print(snp_count, end=' ', file=sys.stderr)
        try:
            div_count = div[chr].count_range(0, div[chr].size)
            ar_div_count += div_count
            print(div_count, file=sys.stderr)
        except:
            print(chr, "failed", file=sys.stderr)

        # Restore the unclipped copies before the feature pass
        div = div_copy
        snp[chr] = snp_copy[chr]
        print("feature:", chr, end=' ', file=sys.stderr)
        feature[chr].iand(ar_mask[chr])  # clip to non-AR only
        snp[chr].iand(feature[chr])
        div[chr].iand(feature[chr])
        feature_snp_count += snp[chr].count_range(0, snp[chr].size)
        print(snp[chr].count_range(0, snp[chr].size),
              div[chr].count_range(0, div[chr].size),
              file=sys.stderr)
        feature_div_count += div[chr].count_range(0, div[chr].size)
        print(snp[chr].count_range(0, snp[chr].size),
              div[chr].count_range(0, div[chr].size),
              file=sys.stderr)

        # Note: can loop over feature intervals here for individual counts
        if chr in intervals:
            for start, end in intervals[chr]:
                ind_div_count = div[chr].count_range(start, end - start)
                ind_snp_count = snp[chr].count_range(start, end - start)
                print(chr, start, end, ind_div_count, ind_snp_count)

    print("feature snp\t%d" % feature_snp_count)
    print("feature div\t%d" % feature_div_count)
    print("ar snp\t%d" % ar_snp_count)
    print("ar div\t%d" % ar_div_count)
コード例 #36
0
def main():
    """Extract MAF blocks overlapping intervals read from stdin.

    Intervals are "src start end [strand]" (or "start end [strand]" when
    --src fixes the source).  Overlapping blocks are written as MAF to
    stdout, or to one file per interval when --dir is given.  With
    --chop, blocks are sliced to the interval boundaries first.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    try:
        maf_files = args
        if options.mincols: mincols = int( options.mincols )
        else: mincols = 0
        if options.src: fixed_src = options.src
        else: fixed_src = None
        if options.prefix: prefix = options.prefix
        else: prefix = None
        if options.dir: dir = options.dir
        else: dir = None
        chop = bool( options.chop )
        do_strand = bool( options.strand )
        use_cache = bool( options.usecache )
    except:
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed( maf_files, keep_open=True,
                                                  parse_e_rows=True,
                                                  use_cache=use_cache )
    # Start MAF on stdout
    if dir is None: 
        out = bx.align.maf.Writer( sys.stdout )
    # Iterate over input ranges 
    for line in sys.stdin:
        strand = None
        fields = line.split()
        if fixed_src:
            src, start, end = fixed_src, int( fields[0] ), int( fields[1] )
            if do_strand: strand = fields[2]
        else:
            src, start, end = fields[0], int( fields[1] ), int( fields[2] )
            if do_strand: strand = fields[3]
        if prefix: src = prefix + src
        # Find overlap with reference component
        blocks = index.get( src, start, end )
        # Open file if needed
        if dir:
            out = bx.align.maf.Writer( open( os.path.join( dir, "%s:%09d-%09d.maf" % ( src, start, end ) ), 'w' ) )
        # Write each intersecting block
        if chop:
            for block in blocks: 
                for ref in block.get_components_by_src( src ):
                    # Clip the slice to both the interval and the block
                    slice_start = max( start, ref.get_forward_strand_start() )
                    slice_end = min( end, ref.get_forward_strand_end() )
                    if (slice_end <= slice_start): continue
                    sliced = block.slice_by_component( ref, slice_start, slice_end ) 
                    # If the block is shorter than the minimum allowed size, stop
                    if mincols and ( sliced.text_size < mincols ):
                        continue
                    # If the reference component is empty, don't write the block
                    if sliced.get_component_by_src( src ).size < 1:
                        continue
                    # Keep only components that are not empty
                    sliced.components = [ c for c in sliced.components if c.size > 0 ]
                    # Reverse complement if needed
                    if ( strand != None ) and ( ref.strand != strand ): 
                        sliced = sliced.reverse_complement()
                    # Write the block
                    out.write( sliced )
        else:
            for block in blocks:
                out.write( block )
        if dir:
            out.close()
    # Close output MAF
    # NOTE(review): with --dir this re-closes the last per-interval
    # writer (and raises NameError if no intervals were read) -- confirm
    out.close()
    index.close()
コード例 #37
0
def main():
    """Annotate each CDS position with its codon degeneracy.

    Reads CDS intervals (format from --format, default 'bed'), fetches
    genomic sequence from nib files under args[0], walks the codons of
    each CDS, and writes one line per position: chrom, start, end, base,
    degeneracy class (count + "d"), amino acid, and optionally the name.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        if options.outfile: 
            out = open( options.outfile, "w")
        else:
            out = sys.stdout
        if options.format:
            format = options.format
        else:
            format = 'bed'

        allpositions = bool( options.allpositions )
        include_name = bool( options.include_name )
        nibdir = args[0]
        bedfile = args[1]
    except:
        doc_optparse.exit()

    # chrom -> nib sequence accessor
    nibs = getnib(nibdir)

    for chrom, strand, cds_exons, name in CDSReader( open(bedfile), format=format):

        cds_seq = ''

        # genome_seq_index maps the position in CDS to position on the genome
        genome_seq_index = []
        for (c_start, c_end) in cds_exons:
            cds_seq += nibs[chrom].get( c_start, c_end-c_start )
            for i in range(c_start,c_end):
                genome_seq_index.append(i)

        cds_seq = cds_seq.upper()

        if strand == '+': 
            frsts = range( 0, len(cds_seq), 3)
            offsign = 1
        else: 
            # Reverse strand: complement the sequence and walk codons
            # starting at offset 2 with negative in-codon offsets
            cds_seq = Comp( cds_seq )
            frsts = range( 2, len(cds_seq), 3)
            offsign = -1

        offone = 1 * offsign
        offtwo = 2 * offsign

        # NOTE(review): 'all' shadows the builtin of the same name
        all = ['A','C','G','T']

        for first_pos in frsts:
            c1 = first_pos
            c2 = first_pos + offone
            c3 = first_pos + offtwo
            try:
                assert c3 < len(cds_seq)
            except AssertionError:
                # Incomplete trailing codon: report to stderr and skip
                print >>sys.stderr, "out of sequence at %d for %s, %d" % (c3, chrom, genome_seq_index[ first_pos ])
                continue
            codon = cds_seq[c1], cds_seq[c2], cds_seq[c3]
            aa = translate( codon, GEN_CODE )
            # Third-position degeneracy: number of third-base choices in
            # GEN_CODE that yield the same amino acid
            degeneracy3 = str(GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)) + "d"

            if not include_name: name_text = ''
            else: 
                name_text = name.replace(' ','_')

            if allpositions:
                try:
                    degeneracy1 = str([GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)) + "d"
                    degeneracy2 = str([GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)) + "d"
                except TypeError, s:
                    print >>sys.stderr, GEN_CODE.values()
                    raise TypeError, s

                if strand == '+':
                    print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
                    print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                else:
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                    print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
                    print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
            else:
                if strand == '+':
                    for b in c1,c2:
                        print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                else:
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                    for b in c2,c1:
                        print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
コード例 #38
0
def main():
    """Build an interval index for a wiggle file.

    Reads the wiggle file named on the command line (optionally bz2 or
    lzo compressed, with a pre-built seek table) and writes an index
    mapping each track section (chrom, start, end) to the byte offset of
    its header in the file.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )
    if options.version: return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith( ".bz2" ):
            table_file = wiggle_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index bz2 compressed files first "
                                   "create a bz2t file with bzip-table." )
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File( wiggle_file, table_file )
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith( ".lzo" ):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index lzo compressed files first "
                                   "create a lzot file with bzip-table." )
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile( wiggle_file, table_file )
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open( wiggle_file )
        # Determine the name of the index file
        if len( args ) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except Exception:
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the iterator, as there is no next() and thus
    # no way to access the positions. The following code is 
    # modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None

    mode = "bed"

    while True:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line: break

        if line.isspace() or line.startswith( "track" ) or line.startswith( "#" ) or line.startswith( "browser" ):
            continue
        elif line.startswith( "bed" ):
            # BUGFIX: 'fields' was previously referenced here without
            # ever being assigned for this line
            fields = line.split()
            indexes.add( fields[0], int( fields[1] ), int( fields[2] ), pos )
        elif line.startswith( "variableStep" ) or line.startswith( "fixedStep"):
            # New section header: record the extent of the previous one
            if first_pos is not None:
                indexes.add( last_chrom, start, end, first_pos )
            first_pos = pos
            header = bx.wiggle.parse_header( line )
            last_chrom = header['chrom']
            # Shift start by one (wiggle headers use 1-based coordinates)
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header: 
                current_span = int( header['span'] )
            else: 
                current_span = 1
            if 'step' in header: 
                current_step = int( header['step'] )

            if line.startswith( "variableStep" ):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "variableStep":
            fields = line.split()
            end = int( fields[0] ) - 1 + current_span
        elif mode == "fixedStep":
            end += current_step
        else:
            # BUGFIX: raising a plain string is invalid; raise a real
            # exception type instead
            raise ValueError( "Unexpected input line: %s" % line.strip() )

    # BUGFIX: flush the final section, which was previously never indexed
    if first_pos is not None:
        indexes.add( last_chrom, start, end, first_pos )

    out = open( index_file, 'w' )
    indexes.write( out )
    out.close()
コード例 #39
0
def main():
    """Report per-species alignment coverage for BED3 intervals on stdin.

    Writes one row per interval: chrom, start, end, then for each
    requested species either the fraction of non-missing bases that are
    aligned or "NA" when too much of the interval is missing data.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )
    try:
        maf_files = args
        species = options.species.split( "," )
        prefix = options.prefix
        use_cache = bool( options.usecache )
        if not prefix:
            prefix = ""
    except:
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed( maf_files, 
                                      parse_e_rows=True,
                                      use_cache=use_cache )
    # Print header
    print("#chr", "start", "end", end=' ')
    for s in species:
        print(s, end=' ')
    print()
    # Iterate over input ranges 
    for line in sys.stdin:
        fields = line.split()
        # Input is BED3+
        chr, start, end = fields[0], int( fields[1] ), int( fields[2] )
        length = end - start
        assert length > 0, "Interval has length less than one"
        # Prepend prefix if specified
        src = prefix + chr    
        # Keep a bitset for each species noting covered pieces
        aligned_bits = []
        missing_bits = []
        for s in species:
            aligned_bits.append( zeros( length, dtype=bool ) )
            missing_bits.append( zeros( length, dtype=bool ) )
        # Find overlap with reference component
        blocks = index.get( src, start, end )
        # Determine alignability for each position
        for block in blocks:
            # Determine the piece of the human interval this block covers, 
            # relative to the start of the interval of interest
            ref = block.get_component_by_src( src )
            assert ref.strand == "+", \
                "Reference species blocks must be on '+' strand"
            rel_start = max( start, ref.start ) - start
            rel_end = min( end, ref.end ) - start
            # Check alignability for each species
            for i, s in enumerate( species ):
                other = block.get_component_by_src_start( s )
                # Species does not appear at all indicates unaligned (best we
                # can do here?)
                if other is None:
                    continue
                # An empty component might indicate missing data, all other
                # cases (even contiguous) we count as not aligned
                if other.empty:
                    if other.synteny_empty == bx.align.maf.MAF_MISSING_STATUS:
                        missing_bits[i][rel_start:rel_end] = True
                # Otherwise we have a local alignment with some text, call
                # it aligned
                else:
                    aligned_bits[i][rel_start:rel_end] = True
        # Now determine the total alignment coverage of each interval
        print(chr, start, end, end=' ')
        for i, s in enumerate( species ):
            aligned = sum( aligned_bits[i] )
            missing = sum( missing_bits[i] )
            # An interval will be called missing if it is < 100bp and <50% 
            # present, or more than 100bp and less that 50bp present (yes,
            # arbitrary)
            # NOTE(review): 'is_missing' is assigned but never used
            is_missing = False
            if length < 100 and missing > ( length / 2 ):
                print("NA", end=' ')
            elif length >= 100 and missing > 50:
                print("NA", end=' ')
            else:
                print(aligned / ( length - missing ), end=' ')

        print()

    # Close MAF files
    index.close()
コード例 #40
0
def main():
    """Tabulate SNP and divergence counts inside and outside features.

    Positional args: feature bed, AR (args[1]) bed, SNP bed, and a
    directory of per-chromosome divergence bed files (args[3]).  Writes
    per-interval counts to stdout, progress to stderr, and finishes with
    feature/AR SNP and divergence totals.
    """
    options, args = doc_optparse.parse( __doc__ )
    # NOTE(review): this try wraps all setup and maps any failure to a
    # usage exit, which can hide real errors -- consider narrowing
    try:
        lens = {}
        if options.lens:
            for line in open( options.lens ):
                chrom, length = line.split()
                lens[chrom] = int( length )

        if options.suffix: suffix = options.suffix
        else: suffix = ""

        print >>sys.stderr, "\nReading feature",
        interval_file = open(args[0])
        feature = binned_bitsets_from_file(interval_file, lens=lens)
        interval_file.close()
        # reuse interval file 
        intervals = {}
        interval_file = open(args[0])
        for line in interval_file:
            fields = line.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            if chrom not in intervals: intervals[chrom] = []
            intervals[chrom].append( [start,end] )
        interval_file.close()

        print >>sys.stderr, "\nReading ar",
        ar = binned_bitsets_from_file(open( args[1] ), lens=lens)

        print >>sys.stderr, "\nReading snps",
        snp = binned_bitsets_from_file(open( args[2] ), lens=lens)
        snp_mask = clone_inverted( snp )
        snp_copy = clone( snp )

        print >>sys.stderr, "\nMasking AR",
        ar_mask = clone_inverted( ar )
        print >>sys.stderr

        dirname = args[3]

        if options.mask: mask = binned_bitsets_from_file(open(options.mask), lens=lens)
        else: mask = None
    except:
        doc_optparse.exit()

    # Restrict both the feature and AR sets to the optional mask
    if mask:
        for chrom in mask.keys():
            if chrom in feature: feature[chrom].iand( mask[chrom] )
            if chrom in ar: ar[chrom].iand( mask[chrom] )

    # divergence and snp counts for all features
    feature_div_count = 0
    feature_snp_count = 0
    ar_div_count = 0
    ar_snp_count = 0

    # collect snp and div
    for chr in feature.keys():

        if chr not in snp: continue
        if chr not in ar: continue

        print >>sys.stderr, "reading %s ..." % chr,
        try:
            div = binned_bitsets_from_file( open( dirname + "/%s.bed" % (chr+suffix) ), lens=lens)
        except:
            print >>sys.stderr,"%s.bed not found" % chr
            continue

        div[chr].iand( snp_mask[chr] ) # div/snp sites count snp-only
        div_copy = clone( div )

        print >>sys.stderr, "AR:", chr,
        snp[chr].iand( ar[chr] )
        div[chr].iand( ar[chr] )
        snp_count = snp[chr].count_range(0,snp[chr].size)
        ar_snp_count += snp_count
        print >>sys.stderr, snp_count,
        try:
            div_count = div[chr].count_range(0,div[chr].size)
            ar_div_count += div_count
            print >>sys.stderr, div_count
        except:
            print >>sys.stderr, chr, "failed"

        # Restore the unclipped copies before the feature pass
        div = div_copy
        snp[chr] = snp_copy[chr]
        print >>sys.stderr, "feature:", chr,
        feature[chr].iand( ar_mask[chr] ) # clip to non-AR only
        snp[chr].iand( feature[chr] )
        div[chr].iand( feature[chr] )
        feature_snp_count += snp[chr].count_range(0,snp[chr].size)
        print >>sys.stderr, snp[chr].count_range(0,snp[chr].size), div[chr].count_range(0,div[chr].size)
        feature_div_count += div[chr].count_range(0,div[chr].size)
        print >>sys.stderr, snp[chr].count_range(0,snp[chr].size), div[chr].count_range(0,div[chr].size)

        # Note: can loop over feature intervals here for individual counts
        if chr in intervals:
            for start,end in intervals[chr]:
                ind_div_count = div[chr].count_range(start,end-start)
                ind_snp_count = snp[chr].count_range(start,end-start)
                print chr, start, end, ind_div_count, ind_snp_count

    print "feature snp\t%d" %feature_snp_count
    print "feature div\t%d" %feature_div_count
    print "ar snp\t%d" %ar_snp_count
    print "ar div\t%d" %ar_div_count
コード例 #41
0
def main():
    """Extract MAF blocks overlapping intervals read from stdin.

    Intervals are "src start end [strand]" (or "start end [strand]" when
    --src fixes the source).  Overlapping blocks are written as MAF to
    stdout, or to one file per interval when --dir is given.  With
    --chop, blocks are sliced to the interval boundaries first.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        if options.mincols: mincols = int(options.mincols)
        else: mincols = 0
        if options.src: fixed_src = options.src
        else: fixed_src = None
        if options.prefix: prefix = options.prefix
        else: prefix = None
        if options.dir: dir = options.dir
        else: dir = None
        chop = bool(options.chop)
        do_strand = bool(options.strand)
        use_cache = bool(options.usecache)
    except:
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed(maf_files,
                                      keep_open=True,
                                      parse_e_rows=True,
                                      use_cache=use_cache)
    # Start MAF on stdout
    if dir is None:
        out = bx.align.maf.Writer(sys.stdout)
    # Iterate over input ranges
    for line in sys.stdin:
        strand = None
        fields = line.split()
        if fixed_src:
            src, start, end = fixed_src, int(fields[0]), int(fields[1])
            if do_strand: strand = fields[2]
        else:
            src, start, end = fields[0], int(fields[1]), int(fields[2])
            if do_strand: strand = fields[3]
        if prefix: src = prefix + src
        # Find overlap with reference component
        blocks = index.get(src, start, end)
        # Open file if needed
        if dir:
            out = bx.align.maf.Writer(
                open(os.path.join(dir, "%s:%09d-%09d.maf" % (src, start, end)),
                     'w'))
        # Write each intersecting block
        if chop:
            for block in blocks:
                for ref in block.get_components_by_src(src):
                    # Clip the slice to both the interval and the block
                    slice_start = max(start, ref.get_forward_strand_start())
                    slice_end = min(end, ref.get_forward_strand_end())
                    if (slice_end <= slice_start): continue
                    sliced = block.slice_by_component(ref, slice_start,
                                                      slice_end)
                    # If the block is shorter than the minimum allowed size, stop
                    if mincols and (sliced.text_size < mincols):
                        continue
                    # If the reference component is empty, don't write the block
                    if sliced.get_component_by_src(src).size < 1:
                        continue
                    # Keep only components that are not empty
                    sliced.components = [
                        c for c in sliced.components if c.size > 0
                    ]
                    # Reverse complement if needed
                    if (strand != None) and (ref.strand != strand):
                        sliced = sliced.reverse_complement()
                    # Write the block
                    out.write(sliced)
        else:
            for block in blocks:
                out.write(block)
        if dir:
            out.close()
    # Close output MAF
    # NOTE(review): with --dir this re-closes the last per-interval
    # writer (and raises NameError if no intervals were read) -- confirm
    out.close()
    index.close()
コード例 #42
0
def main():
    """Annotate genomic intervals with score statistics.

    For each interval read from an interval file, compute the average,
    minimum and maximum score across its bases, using either a directory
    of binned-array score files (``--binned``) or a wiggle file. Each
    output line is the original fields plus three columns: avg, min, max.
    Comment lines are passed through unchanged.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)

    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()

    if score_fname == 'None':
        stop_err(
            'This tool works with data from genome builds hg16, hg17 or hg18.  Click the pencil icon in your history item to set the genome build if appropriate.'
        )

    # Convert the user-supplied 1-based column numbers to 0-based indices.
    try:
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except Exception:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )

    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except Exception:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)

    # Optional mask: bases set in the per-chromosome bitset are skipped.
    if mask_fname:
        with open(mask_fname) as mask_file:
            masks = binned_bitsets_from_file(mask_file)
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    data_lines = 0  # non-blank, non-comment lines seen

    with open(interval_fname) as interval_file:
        for i, line in enumerate(interval_file):
            line = line.rstrip('\r\n')
            if line.startswith('#'):
                # We'll save the original comments
                print(line, file=out_file)
                continue
            if not line:
                continue
            data_lines += 1
            fields = line.split()
            try:
                chrom = fields[chrom_col]
                start = int(fields[start_col])
                stop = int(fields[stop_col])
            except Exception:
                # Count each malformed line exactly once (the original code
                # incremented skipped_lines twice per invalid line).
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue

            total = 0
            count = 0
            min_score = 100000000
            max_score = -100000000
            if chrom in scores_by_chrom:
                # Hoist per-chromosome lookups out of the per-base loop.
                chrom_scores = scores_by_chrom[chrom]
                if masks is not None and chrom in masks:
                    chrom_mask = masks[chrom]
                else:
                    chrom_mask = None
                for j in range(start, stop):
                    try:
                        # Skip if base is masked
                        if chrom_mask is not None and chrom_mask[j]:
                            continue
                        # Get the score, only count if not 'nan'
                        score = chrom_scores[j]
                        if not isnan(score):
                            total += score
                            count += 1
                            max_score = max(score, max_score)
                            min_score = min(score, min_score)
                    except Exception:
                        # Positions outside the scored range are ignored.
                        continue
            if count > 0:
                avg = total / count
            else:
                avg = "nan"
                min_score = "nan"
                max_score = "nan"

            # Build the resulting line of data: original fields + stats.
            out_line = list(fields)
            out_line.extend([avg, min_score, max_score])
            print("\t".join(map(str, out_line)), file=out_file)

    # Only close the output if we opened it; closing sys.stdout here would
    # make the diagnostic prints below fail on a closed stream.
    if out_file is not sys.stdout:
        out_file.close()

    if skipped_lines > 0:
        print(
            'Data issue: skipped %d invalid lines starting at line #%d which is "%s"'
            % (skipped_lines, first_invalid_line, invalid_line))
        if skipped_lines == data_lines:
            print(
                'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
            )
コード例 #43
0
def main():
    """Annotate genomic intervals with score statistics.

    For each interval read from an interval file, compute the average,
    minimum and maximum score across its bases, using either a directory
    of binned-array score files (``--binned``) or a wiggle file. Each
    output line is the original fields plus three columns: avg, min, max.
    Comment lines are passed through unchanged.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)

    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        # Narrowed from a bare except: so SystemExit/KeyboardInterrupt pass.
        doc_optparse.exit()

    if score_fname == 'None':
        stop_err('This tool works with data from genome builds hg16, hg17 or hg18.  Click the pencil icon in your history item to set the genome build if appropriate.')

    # Convert the user-supplied 1-based column numbers to 0-based indices.
    try:
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except Exception:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')

    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except Exception:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)

    # Optional mask: bases set in the per-chromosome bitset are skipped.
    if mask_fname:
        with open(mask_fname) as mask_file:
            masks = binned_bitsets_from_file(mask_file)
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    data_lines = 0  # non-blank, non-comment lines seen

    with open(interval_fname) as interval_file:
        for i, line in enumerate(interval_file):
            line = line.rstrip('\r\n')
            if line.startswith('#'):
                # We'll save the original comments
                print(line, file=out_file)
                continue
            if not line:
                continue
            data_lines += 1
            fields = line.split()
            try:
                chrom = fields[chrom_col]
                start = int(fields[start_col])
                stop = int(fields[stop_col])
            except Exception:
                # Count each malformed line exactly once (the original code
                # incremented skipped_lines twice per invalid line).
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue

            total = 0
            count = 0
            min_score = 100000000
            max_score = -100000000
            if chrom in scores_by_chrom:
                # Hoist per-chromosome lookups out of the per-base loop.
                chrom_scores = scores_by_chrom[chrom]
                if masks is not None and chrom in masks:
                    chrom_mask = masks[chrom]
                else:
                    chrom_mask = None
                for j in range(start, stop):
                    try:
                        # Skip if base is masked
                        if chrom_mask is not None and chrom_mask[j]:
                            continue
                        # Get the score, only count if not 'nan'
                        score = chrom_scores[j]
                        if not isnan(score):
                            total += score
                            count += 1
                            max_score = max(score, max_score)
                            min_score = min(score, min_score)
                    except Exception:
                        # Positions outside the scored range are ignored.
                        continue
            if count > 0:
                avg = total / count
            else:
                avg = "nan"
                min_score = "nan"
                max_score = "nan"

            # Build the resulting line of data: original fields + stats.
            out_line = list(fields)
            out_line.extend([avg, min_score, max_score])
            print("\t".join(map(str, out_line)), file=out_file)

    # Only close the output if we opened it; closing sys.stdout here would
    # make the diagnostic prints below fail on a closed stream.
    if out_file is not sys.stdout:
        out_file.close()

    if skipped_lines > 0:
        print('Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % (skipped_lines, first_invalid_line, invalid_line))
        if skipped_lines == data_lines:
            print('Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.')
コード例 #44
0
def main():
    """Report codon-position degeneracy over CDS intervals.

    Reads CDS exons from an interval file, fetches the sequence from .nib
    files, translates each codon, and prints one row per base with its
    degeneracy class ("1d".."4d") and the encoded amino acid.

    NOTE(review): this is Python 2 code (print-chevron statements and
    ``except TypeError, s`` syntax); it will not run under Python 3.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        # Output stream: named file, or stdout by default.
        if options.outfile: 
            out = open( options.outfile, "w")
        else:
            out = sys.stdout
        # Interval format defaults to BED.
        if options.format:
            format = options.format
        else:
            format = 'bed'

        allpositions = bool( options.allpositions )  # report degeneracy at all 3 codon positions
        include_name = bool( options.include_name )  # append the feature name to each row
        nibdir = args[0]   # directory containing per-chromosome .nib sequence files
        bedfile = args[1]  # interval file describing the CDS exons
    except:
        doc_optparse.exit()

    nibs = getnib(nibdir)

    for chrom, strand, cds_exons, name in CDSReader( open(bedfile), format=format):

        cds_seq = ''

        # genome_seq_index maps the position in CDS to position on the genome
        genome_seq_index = []
        for (c_start, c_end) in cds_exons:
            cds_seq += nibs[chrom].get( c_start, c_end-c_start )
            for i in range(c_start,c_end):
                genome_seq_index.append(i)

        cds_seq = cds_seq.upper()

        if strand == '+': 
            # Forward strand: codons start at indices 0, 3, 6, ...
            frsts = range( 0, len(cds_seq), 3)
            offsign = 1
        else: 
            # Reverse strand: complement the sequence and walk each codon
            # right-to-left, starting from indices 2, 5, 8, ...
            cds_seq = Comp( cds_seq )
            frsts = range( 2, len(cds_seq), 3)
            offsign = -1

        # Offsets from the first codon base to the second and third bases
        # (negative on the reverse strand).
        offone = 1 * offsign
        offtwo = 2 * offsign

        all = ['A','C','G','T']

        for first_pos in frsts:
            c1 = first_pos
            c2 = first_pos + offone
            c3 = first_pos + offtwo
            # Guard against a trailing partial codon at the end of the CDS.
            try:
                assert c3 < len(cds_seq)
            except AssertionError:
                print >>sys.stderr, "out of sequence at %d for %s, %d" % (c3, chrom, genome_seq_index[ first_pos ])
                continue
            codon = cds_seq[c1], cds_seq[c2], cds_seq[c3]
            aa = translate( codon, GEN_CODE )
            # Third-position degeneracy: how many third-base choices encode
            # the same amino acid (relies on py2 dict.values() being a list).
            degeneracy3 = str(GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)) + "d"

            if not include_name: name_text = ''
            else: 
                name_text = name.replace(' ','_')

            if allpositions:
                # First/second-position degeneracy: substitute every base at
                # that position and count how many codons stay synonymous.
                try:
                    degeneracy1 = str([GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)) + "d"
                    degeneracy2 = str([GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)) + "d"
                except TypeError, s:
                    print >>sys.stderr, GEN_CODE.values()
                    raise TypeError, s

                # Emit the three positions in genome order (reversed for '-').
                if strand == '+':
                    print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
                    print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                else:
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                    print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
                    print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
            else:
                # Only the third position gets its computed degeneracy; the
                # first two are reported as "1d".
                if strand == '+':
                    for b in c1,c2:
                        print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                else:
                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
                    for b in c2,c1:
                        print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
コード例 #45
0
def __main__():
    """Slice MAF alignments from stdin down to the pieces that overlap a
    set of intervals, writing the resulting blocks to stdout.

    The reference component is chosen either by numeric index or by source
    name (second command-line argument). Slices that empty any component,
    or whose text size is not above the minimum column count, are dropped.
    """
    # Parse Command Line
    options, args = doc_optparse.parse( __doc__ )

    try:
        range_filename = args[ 0 ]
        # Second argument: numeric component index, or a source name.
        try:
            refindex = int( args[ 1 ] )
            refname = None
        except:
            refindex = None
            refname = args[ 1 ]
        mincols = int( options.mincols ) if options.mincols else 10
        prefix = options.prefix if options.prefix else ""
    except:
        doc_optparse.exit()

    # Build one Intersecter per source from the interval file.
    intersecters = {}
    for row in file( range_filename ):
        cols = row.split()
        key = prefix + cols[0]
        if key not in intersecters:
            intersecters[key] = intervals.Intersecter()
        intersecters[key].add_interval(
            intervals.Interval( int( cols[1] ), int( cols[2] ) ) )

    # Stream MAF blocks from stdin, writing kept slices to stdout.
    writer = bx.align.maf.Writer( sys.stdout )

    for block in bx.align.maf.Reader( sys.stdin ):
        # Resolve the reference component index by name when requested;
        # skip blocks that do not contain the named source.
        if refname:
            sourcenames = [ c.src.split('.')[0] for c in block.components ]
            try:
                refindex = sourcenames.index( refname )
            except:
                continue

        ref = block.components[ refindex ]
        # Find overlap with reference component
        if ref.src not in intersecters:
            continue
        hits = intersecters[ ref.src ].find( ref.start, ref.end )
        # Keep output maf ordered
        hits.sort()
        # Write each intersecting slice
        for hit in hits:
            lo = max( hit.start, ref.start )
            hi = min( hit.end, ref.end )
            piece = block.slice_by_component( refindex, lo, hi )
            # Drop slices where any component came out empty.
            if any( c.size < 1 for c in piece.components ):
                continue
            if piece.text_size > mincols:
                writer.write( piece )

    # Close output MAF
    writer.close()