def main():
    """Print, for each interval, the BED fields followed by the per-base scores.

    args: score file, interval file, optional output file (default stdout).
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        out_file = open(args[2], "w") if len(args) > 2 else sys.stdout
    except Exception:
        doc_optparse.exit()
    # BUG FIX: the original read sys.argv directly (ignoring parsed args) and
    # opened score/interval files it never used; use the parsed arguments.
    scores_by_chrom = read_scores(misc.open_compressed(score_fname))
    for line in open(interval_fname):
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        if chrom in scores_by_chrom:
            ba = scores_by_chrom[chrom]
            scores = [ba[i] for i in range(start, stop)]
        else:
            scores = []
        print(" ".join(fields), " ".join(map(str, scores)), file=out_file)
    out_file.close()
def main():
    """Print a score for each MAF block read from stdin.

    Optionally recalculates scores with the HOX70 matrix and/or normalizes
    by alignment length (text_size). Prints "NA" for zero-length blocks.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        lnorm = bool(options.lnorm)
        recalculate = bool(options.recalculate)
    except Exception:
        doc_optparse.exit()
    # HOX70 substitution matrix, gap open 400 / gap extend 30
    hox70 = score.build_scoring_scheme("""  A    C    G    T
                                      91 -114  -31 -123
                                    -114  100 -125  -31
                                     -31 -125  100 -114
                                    -123  -31 -114   91 """, 400, 30, default=0)
    maf_reader = maf.Reader(sys.stdin)
    for m in maf_reader:
        if m.text_size == 0:
            print("NA")
            continue
        s = m.score
        # Recalculate?
        if recalculate:
            s = hox70.score_alignment(m)
        # Normalize?
        if lnorm:
            s = s / m.text_size
        # Print
        print(s)
def main():
    """Emit one score per MAF block from stdin ("NA" for empty blocks)."""
    opts, arguments = doc_optparse.parse(__doc__)
    try:
        normalize = bool(opts.lnorm)
        rescore = bool(opts.recalculate)
    except Exception:
        doc_optparse.exit()
    scheme = score.build_scoring_scheme("""  A    C    G    T
                                      91 -114  -31 -123
                                    -114  100 -125  -31
                                     -31 -125  100 -114
                                    -123  -31 -114   91 """, 400, 30, default=0)
    for block in maf.Reader(sys.stdin):
        # Zero-length alignments have no meaningful score
        if block.text_size == 0:
            print("NA")
            continue
        value = scheme.score_alignment(block) if rescore else block.score
        if normalize:
            value = value / block.text_size
        print(value)
def main():
    """Merge reference intervals of MAF blocks covered by enough species.

    args: comma-separated species list, required species count.  Blocks closer
    than the module-level SPAN threshold are bridged; intervals at least MIN
    long are printed as "chrom start end".
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        species = args[0].split(',')
        nrequired = int(args[1])
    except Exception:
        doc_optparse.exit()
    maf_reader = bx.align.maf.Reader(sys.stdin)
    interval_start = None
    interval_end = None
    for m in maf_reader:
        ref = m.components[0]
        # Does this alignment have enough of the required species
        if nrequired <= len([comp for comp in m.components
                             if comp.src.split('.')[0] in species]):
            if interval_start is None:
                interval_start = ref.start
                interval_end = ref.end
            elif ref.start - interval_end < SPAN:
                # Close enough to the running interval: extend it
                interval_end = ref.end
            else:
                if interval_end - interval_start >= MIN:
                    print(ref.src.split('.')[1], interval_start, interval_end)
                interval_start = ref.start
                interval_end = ref.end
        else:
            # Use "is not None" rather than "!= None" (identity test)
            if interval_start is not None and interval_end - interval_start >= MIN:
                print(ref.src.split('.')[1], interval_start, interval_end)
            interval_start = None
            interval_end = None
def main():
    """Restrict each MAF block on stdin to the given species; write MAF to stdout."""
    options, args = doc_optparse.parse(__doc__)
    try:
        species = args
        # Allow a comma separated list, TODO: allow a newick format tree
        if len(species) == 1 and ',' in species[0]:
            species = species[0].split(',')
        fuse = not (bool(options.nofuse))
    except Exception:  # narrowed from bare except (was swallowing SystemExit too)
        doc_optparse.exit()
    maf_reader = bx.align.maf.Reader(sys.stdin)
    maf_writer = bx.align.maf.Writer(sys.stdout)
    # Optionally fuse adjacent blocks that become joinable after filtering
    if fuse:
        maf_writer = FusingAlignmentWriter(maf_writer)
    for m in maf_reader:
        new_components = get_components_for_species(m, species)
        if new_components:
            remove_all_gap_columns(new_components)
            m.components = new_components
            # Old score no longer applies to the reduced block
            m.score = 0.0
            maf_writer.write(m)
    maf_reader.close()
    maf_writer.close()
def main():
    """Keep only the requested species in each MAF block read from stdin."""
    opts, arguments = doc_optparse.parse(__doc__)
    try:
        wanted = arguments
        # Allow a comma separated list, TODO: allow a newick format tree
        if len(wanted) == 1 and "," in wanted[0]:
            wanted = wanted[0].split(",")
        fuse = not bool(opts.nofuse)
    except:
        doc_optparse.exit()
    reader = bx.align.maf.Reader(sys.stdin)
    writer = bx.align.maf.Writer(sys.stdout)
    if fuse:
        writer = FusingAlignmentWriter(writer)
    for alignment in reader:
        kept = get_components_for_species(alignment, wanted)
        if not kept:
            continue
        remove_all_gap_columns(kept)
        alignment.components = kept
        alignment.score = 0.0
        writer.write(alignment)
    reader.close()
    writer.close()
def main():
    """Convert single-chromosome wiggle data to a BinnedArray file.

    args: score (wiggle) file, output file.  --comp selects the compression
    type passed to BinnedArray.to_file.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        comp_type = options.comp if options.comp else None
        score_fname = args[0]
        out_fname = args[1]
    except Exception:
        doc_optparse.exit()
    scores = BinnedArray()
    # NOTE: expects wiggle input covering only one chromosome; positions from
    # different chromosomes would silently overwrite each other.
    for i, (chrom, pos, val) in enumerate(
            bx.wiggle.Reader(misc.open_compressed(score_fname))):
        scores[pos] = val
        # Progress status
        if i % 10000 == 0:
            print(i, "scores processed")
    out = open(out_fname, "w")
    if comp_type:
        scores.to_file(out, comp_type=comp_type)
    else:
        scores.to_file(out)
    out.close()
def main():
    """Print, for each interval, the BED fields followed by the per-base scores.

    args: score file, interval file, optional output file (default stdout).
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        out_file = open(args[2], 'w') if len(args) > 2 else sys.stdout
    except Exception:
        doc_optparse.exit()
    # BUG FIX: the original read sys.argv directly (ignoring parsed args) and
    # opened score/interval files it never used; use the parsed arguments.
    scores_by_chrom = read_scores(misc.open_compressed(score_fname))
    for line in open(interval_fname):
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        if chrom in scores_by_chrom:
            ba = scores_by_chrom[chrom]
            scores = [ba[i] for i in range(start, stop)]
        else:
            scores = []
        print(" ".join(fields), " ".join(map(str, scores)), file=out_file)
    out_file.close()
def main():
    """Build an interval index for a MAF file (plain, .bz2, or .lzo).

    args: maf file, optional index file name (default: maf file + ".index").
    --species restricts indexing to the listed species.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith(".bz2"):
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File(maf_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile(maf_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open(maf_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = maf_file + ".index"
        if options.species:
            species = options.species.split(",")
        else:
            species = None
    except Exception:
        doc_optparse.exception()
    maf_reader = bx.align.maf.Reader(maf_in)
    indexes = interval_index_file.Indexes()
    # Need to be a bit tricky in our iteration here to get the 'tells' right
    while True:
        pos = maf_reader.file.tell()
        # BUG FIX: .next() is Python-2-only; next() works on both
        block = next(maf_reader)
        if block is None:
            break
        for c in block.components:
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add(c.src, c.forward_strand_start, c.forward_strand_end,
                        pos, max=c.src_size)
    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
def main():
    """Report avg/min/max score over each BED interval from a score source."""
    opts, arguments = doc_optparse.parse(__doc__)
    try:
        score_fname = arguments[0]
        interval_fname = arguments[1]
        out_file = open(arguments[2], 'w') if len(arguments) > 2 else sys.stdout
        binned = bool(opts.binned)
        mask_fname = opts.mask
    except Exception:
        doc_optparse.exit()
    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        scores_by_chrom = load_scores_wiggle(score_fname)
    masks = binned_bitsets_from_file(open(mask_fname)) if mask_fname else None
    for line in open(interval_fname):
        fields = line.split()
        chrom = fields[0]
        start, stop = int(fields[1]), int(fields[2])
        total = 0
        count = 0
        lo = 100000000
        hi = -100000000
        for pos in range(start, stop):
            # Only positions with a truthy score present are considered
            if not (chrom in scores_by_chrom and scores_by_chrom[chrom][pos]):
                continue
            # Skip masked bases
            if masks and chrom in masks and masks[chrom][pos]:
                continue
            value = scores_by_chrom[chrom][pos]
            # Only count non-NaN scores
            if isNaN(value):
                continue
            total += value
            count += 1
            hi = max(value, hi)
            lo = min(value, lo)
        if count > 0:
            avg = total / count
        else:
            avg = "nan"
            lo = "nan"
            hi = "nan"
        print("\t".join(map(str, [chrom, start, stop, avg, lo, hi])),
              file=out_file)
    out_file.close()
def main():
    """Report avg/min/max score over each BED interval from a score source."""
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        if len(args) > 2:
            out_file = open(args[2], "w")
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:  # narrowed from bare except
        doc_optparse.exit()
    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        scores_by_chrom = load_scores_wiggle(score_fname)
    if mask_fname:
        masks = binned_bitsets_from_file(open(mask_fname))
    else:
        masks = None
    for line in open(interval_fname):
        fields = line.split()
        chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
        total = 0
        count = 0
        min_score = 100000000
        max_score = -100000000
        for i in range(start, stop):
            if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                # Skip if base is masked
                if masks and chrom in masks:
                    if masks[chrom][i]:
                        continue
                # Get the score, only count if not 'nan'
                score = scores_by_chrom[chrom][i]
                if not isNaN(score):
                    total += score
                    count += 1
                    max_score = max(score, max_score)
                    min_score = min(score, min_score)
        if count > 0:
            avg = total / count
        else:
            avg = "nan"
            min_score = "nan"
            max_score = "nan"
        # Converted from the Python 2 "print >> out_file" statement
        print("\t".join(map(str, [chrom, start, stop, avg, min_score,
                                  max_score])), file=out_file)
    out_file.close()
def __main__():
    """Print per-species coverage fractions for BED3 intervals on stdin."""
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        prefix = options.prefix if options.prefix else None
    except Exception:
        doc_optparse.exit()
    # Open indexed access to mafs
    indexes = [bx.align.maf.Indexed(maf_file, maf_file + ".index")
               for maf_file in maf_files]
    # Iterate over input ranges
    for line in sys.stdin:
        fields = line.split()
        src, start, end = fields[0], int(fields[1]), int(fields[2])
        if prefix:
            src = prefix + src
        total_length = end - start
        # Find overlap with reference component
        blocks = []
        for index in indexes:
            blocks += index.get(src, start, end)
        coverage = dict()
        for block in blocks:
            overlap_start = max(start, block.components[0].start)
            overlap_end = min(end, block.components[0].end)
            length = overlap_end - overlap_start
            assert length > 0
            for c in block.components[1:]:
                species = c.src.split('.')[0]
                # BUG FIX: was try/bare-except for first-seen species; use
                # dict.get so only the intended missing-key case is handled
                coverage[species] = coverage.get(species, 0) + length
        print(line, end=' ')
        for key, value in coverage.items():
            print(" ", key.ljust(10), "%0.2f" % (value / total_length))
def __main__():
    """Write AXT blocks from stdin sliced to intervals from a range file.

    args: range file, reference component index.  --mincols sets the minimum
    sliced alignment length (default 10).
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        range_filename = args[0]
        refindex = int(args[1])
        mincols = int(options.mincols) if options.mincols else 10
    except Exception:
        doc_optparse.exit()
    # Load Intervals -- file() is Python-2-only, use open()
    intersecter = intervals.Intersecter()
    for line in open(range_filename):
        fields = line.split()
        intersecter.add_interval(
            intervals.Interval(int(fields[0]), int(fields[1])))
    # Start axt on stdout
    out = bx.align.axt.Writer(sys.stdout)
    # Iterate over input axt
    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start, ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            # Reject slices where any component became empty
            good = True
            for c in sliced.components:
                if c.size < 1:
                    good = False
            if good and sliced.text_size > mincols:
                out.write(sliced)
    # Close output axt
    out.close()
def __main__():
    """Print the text after "chr" in each MAF block's reference source name."""
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        refindex = int(args[0])
    except Exception:
        doc_optparse.exit()
    maf_reader = maf.Reader(sys.stdin)
    for m in maf_reader:
        c = m.components[refindex].src
        # Converted from the Python 2 print statement
        print(c[c.rfind("chr") + 3:])
def __main__():
    """Emit the chromosome suffix of the reference component of each block."""
    options, args = doc_optparse.parse(__doc__)
    try:
        ref_idx = int(args[0])
    except Exception:
        doc_optparse.exit()
    for block in maf.Reader(sys.stdin):
        src_name = block.components[ref_idx].src
        marker = src_name.rfind("chr")
        print(src_name[marker + 3:])
def __main__():
    """Write AXT blocks from stdin sliced to intervals from a range file.

    args: range file, reference component index.  --mincols sets the minimum
    sliced alignment length (default 10).
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        range_filename = args[0]
        refindex = int(args[1])
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 10
    except Exception:
        doc_optparse.exit()
    # Load Intervals -- file() is Python-2-only, use open()
    intersecter = intervals.Intersecter()
    for line in open(range_filename):
        fields = line.split()
        intersecter.add_interval(
            intervals.Interval(int(fields[0]), int(fields[1])))
    # Start axt on stdout
    out = bx.align.axt.Writer(sys.stdout)
    # Iterate over input axt
    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start, ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            # Reject slices where any component became empty
            good = True
            for c in sliced.components:
                if c.size < 1:
                    good = False
            if good and sliced.text_size > mincols:
                out.write(sliced)
    # Close output axt
    out.close()
def __main__():
    """Print wrapped sequence for each range of a range file, from a nib file."""
    options, args = doc_optparse.parse(__doc__)
    try:
        range_file = open(args[0])
        # file() is Python-2-only; nib data is binary, so open in "rb"
        nib_file = open(args[1], "rb")
    except Exception:
        doc_optparse.exit()
    nib = bx.seq.nib.NibFile(nib_file)
    for line in range_file:
        fields = line.split()
        start, end = int(fields[0]), int(fields[1])
        # Converted from the Python 2 print statement
        print(">", start, end)
        print_wrapped(nib.get(start, end - start))
def __main__():
    """Print wrapped sequence for each range of a range file, from a nib file."""
    options, args = doc_optparse.parse(__doc__)
    try:
        range_file = open(args[0])
        # file() is Python-2-only; nib data is binary, so open in "rb"
        nib_file = open(args[1], "rb")
    except Exception:
        doc_optparse.exit()
    nib = bx.seq.nib.NibFile(nib_file)
    for line in range_file:
        fields = line.split()
        start, end = int(fields[0]), int(fields[1])
        # Converted from the Python 2 print statement
        print(">", start, end)
        print_wrapped(nib.get(start, end - start))
def __main__():
    """Count alignments, columns, or reference bases in a MAF stream."""
    options, args = doc_optparse.parse(__doc__)
    try:
        if options.cols:
            action = "cols"
        elif options.bases:
            action = "bases"
        else:
            action = "aligns"
        print_each = bool(options.each)
        ref = int(options.ref) if options.ref else 0
        skip = options.skip if options.skip else None
    except Exception:
        doc_optparse.exit()
    total = 0
    for block in bx.align.maf.Reader(sys.stdin):
        if action == "aligns":
            total += 1
        elif action == "cols":
            total += block.text_size
        elif action == "bases":
            comp = block.components[ref]
            if skip:
                # Don't count characters matching the skip character
                total += comp.size - comp.text.count(skip)
            else:
                total += comp.size
        if print_each:
            print(total)
            total = 0
    if not print_each:
        print(total)
def __main__():
    """Print per-species coverage fractions for BED3 intervals on stdin."""
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
    except Exception:
        doc_optparse.exit()
    # Open indexed access to mafs
    indexes = [bx.align.maf.Indexed(maf_file, maf_file + ".index")
               for maf_file in maf_files]
    # Iterate over input ranges
    for line in sys.stdin:
        fields = line.split()
        src, start, end = fields[0], int(fields[1]), int(fields[2])
        if prefix:
            src = prefix + src
        total_length = end - start
        # Find overlap with reference component
        blocks = []
        for index in indexes:
            blocks += index.get(src, start, end)
        coverage = dict()
        for block in blocks:
            overlap_start = max(start, block.components[0].start)
            overlap_end = min(end, block.components[0].end)
            length = overlap_end - overlap_start
            assert length > 0
            for c in block.components[1:]:
                species = c.src.split('.')[0]
                # BUG FIX: was try/bare-except for first-seen species; use
                # dict.get so only the intended missing-key case is handled
                coverage[species] = coverage.get(species, 0) + length
        print(line, end=' ')
        for key, value in coverage.items():
            print(" ", key.ljust(10), "%0.2f" % (value / total_length))
def __main__():
    """Print sequence for BED3 ranges on stdin from per-chromosome nib files."""
    options, args = doc_optparse.parse(__doc__)
    try:
        nib_dir = args[0]
    except Exception:
        doc_optparse.exit()
    # Cache one open NibFile per chromosome
    nibs = {}
    for line in sys.stdin:
        fields = line.split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        # Converted from the Python 2 print statement
        print(">", chrom, start, end)
        if chrom in nibs:
            nib = nibs[chrom]
        else:
            # file() is Python-2-only; nib data is binary, so open in "rb"
            nibs[chrom] = nib = bx.seq.nib.NibFile(
                open("%s/%s.nib" % (nib_dir, chrom), "rb"))
        print_wrapped(nib.get(start, end - start))
def main():
    """Split a multi-chromosome wiggle file into per-chromosome BinnedArray files.

    Each chromosome's scores are written to a file named after the chromosome.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
    except Exception:
        doc_optparse.exit()
    scores = {}
    # BUG FIX: the original opened sys.argv[1] directly, ignoring the parsed
    # argument (wrong when options precede the filename)
    for i, (chrom, pos, val) in enumerate(bx.wiggle.Reader(open(score_fname))):
        if chrom not in scores:
            scores[chrom] = BinnedArray()
        scores[chrom][pos] = val
        # Status
        if i % 10000 == 0:
            print(i, "scores processed")
    for chrom, ba in scores.items():
        out = open(chrom, "w")
        ba.to_file(out)
        out.close()
def __main__():
    """Write MAF blocks from stdin whose reference overlaps NO given interval."""
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        assert len(args) > 0
    except Exception:
        doc_optparse.exit()
    # Load Intervals -- file() is Python-2-only, use open()
    intersector = intervals.Intersecter()
    for f in args:
        for line in open(f):
            # Skip comments and blank lines
            if line.startswith("#") or line.isspace():
                continue
            fields = line.split()
            intersector.add_interval(
                intervals.Interval(int(fields[0]), int(fields[1])))
    # Start MAF on stdout
    out = bx.align.maf.Writer(sys.stdout)
    # Iterate over input MAF
    for maf in bx.align.maf.Reader(sys.stdin):
        # Find overlap with reference component
        intersections = intersector.find(maf.components[0].start,
                                         maf.components[0].end)
        # Write only if no overlap
        if len(intersections) == 0:
            out.write(maf)
    # Close output MAF
    out.close()
def __main__():
    """Count alignments, columns, or reference bases in a MAF stream on stdin."""
    options, args = doc_optparse.parse(__doc__)
    try:
        if options.cols:
            action = "cols"
        elif options.bases:
            action = "bases"
        else:
            action = "aligns"
        print_each = bool(options.each)
        if options.ref:
            ref = int(options.ref)
        else:
            ref = 0
        if options.skip:
            skip = options.skip
        else:
            skip = None
    except Exception:
        doc_optparse.exit()
    maf_reader = bx.align.maf.Reader(sys.stdin)
    count = 0
    for m in maf_reader:
        if action == "aligns":
            count += 1
        elif action == "cols":
            count += m.text_size
        elif action == "bases":
            if skip:
                # Don't count characters matching the skip character
                count += (m.components[ref].size
                          - m.components[ref].text.count(skip))
            else:
                count += m.components[ref].size
        if print_each:
            # Converted from the Python 2 print statement
            print(count)
            count = 0
    if not print_each:
        print(count)
def __main__():
    """Write MAF blocks from stdin whose reference overlaps NO given interval."""
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        assert len(args) > 0
    except Exception:
        doc_optparse.exit()
    # Load Intervals -- file() is Python-2-only, use open()
    intersector = intervals.Intersecter()
    for f in args:
        for line in open(f):
            # Skip comments and blank lines
            if line.startswith("#") or line.isspace():
                continue
            fields = line.split()
            intersector.add_interval(
                intervals.Interval(int(fields[0]), int(fields[1])))
    # Start MAF on stdout
    out = bx.align.maf.Writer(sys.stdout)
    # Iterate over input MAF
    for maf in bx.align.maf.Reader(sys.stdin):
        # Find overlap with reference component
        intersections = intersector.find(maf.components[0].start,
                                         maf.components[0].end)
        # Write only if no overlap
        if len(intersections) == 0:
            out.write(maf)
    # Close output MAF
    out.close()
def main():
    """Print per-position wiggle values for ranges read from stdin."""
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return
    try:
        wiggle_files = args
        if options.src:
            fixed_src = options.src
        else:
            fixed_src = None
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
        do_strand = bool(options.strand)
        use_cache = bool(options.usecache)
    except Exception:
        doc_optparse.exit()
    # Open indexed access to wiggles
    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files, keep_open=True,
                                            use_cache=use_cache)
    for line in sys.stdin:
        strand = "+"
        fields = line.split()
        if fixed_src:
            src, start, end = fixed_src, int(fields[0]), int(fields[1])
            if do_strand:
                strand = fields[2]
        else:
            src, start, end = fields[0], int(fields[1]), int(fields[2])
            if do_strand:
                strand = fields[3]
        if prefix:
            src = prefix + src
        blocks = index.get(src, start, end)
        for x, values in blocks:
            for v in values:
                # NOTE(review): 'strand' is parsed but never used in the
                # output -- confirm whether it should be emitted
                print("%s\t%i\t%f" % (src, x, v))
                x += 1
def main():
    """Print per-position wiggle values for ranges read from stdin."""
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return
    try:
        wiggle_files = args
        if options.src:
            fixed_src = options.src
        else:
            fixed_src = None
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
        do_strand = bool(options.strand)
        use_cache = bool(options.usecache)
    except Exception:  # narrowed from bare except
        doc_optparse.exit()
    # Open indexed access to wiggles
    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files, keep_open=True,
                                            use_cache=use_cache)
    for line in sys.stdin:
        strand = "+"
        fields = line.split()
        if fixed_src:
            src, start, end = fixed_src, int(fields[0]), int(fields[1])
            if do_strand:
                strand = fields[2]
        else:
            src, start, end = fields[0], int(fields[1]), int(fields[2])
            if do_strand:
                strand = fields[3]
        if prefix:
            src = prefix + src
        blocks = index.get(src, start, end)
        for x, values in blocks:
            for v in values:
                # Converted from the Python 2 print statement
                print("%s\t%i\t%f" % (src, x, v))
                x += 1
def main():
    """Build an interval index for a wiggle file (plain, .bz2, or .lzo).

    args: wiggle file, optional index file name (default: wiggle + ".index").
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return
    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith(".bz2"):
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File(wiggle_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with bzip-table.")
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile(wiggle_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open(wiggle_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except Exception:
        doc_optparse.exception()
    indexes = interval_index_file.Indexes()
    # Can't use the iterator, as there is no next() and thus no way to access
    # the positions. The following code is modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None
    # always '+' for wiggle data
    strand = '+'
    mode = "bed"
    while True:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line:
            break
        if (line.isspace() or line.startswith("track")
                or line.startswith("#") or line.startswith("browser")):
            continue
        elif line.startswith("bed"):
            # BUG FIX: 'fields' was referenced before assignment here
            fields = line.split()
            indexes.add(fields[0], int(fields[1]), int(fields[2]), pos)
        elif line.startswith("variableStep") or line.startswith("fixedStep"):
            # Flush the index entry for the previous step section, if any
            if first_pos is not None:
                indexes.add(last_chrom, start, end, first_pos)
            first_pos = pos
            header = bx.wiggle.parse_header(line)
            last_chrom = header['chrom']
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header:
                current_span = int(header['span'])
            else:
                current_span = 1
            if 'step' in header:
                current_step = int(header['step'])
            if line.startswith("variableStep"):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "variableStep":
            fields = line.split()
            end = int(fields[0]) - 1 + current_span
        elif mode == "fixedStep":
            end += current_step
        else:
            # BUG FIX: string exceptions are illegal; raise a real exception
            raise ValueError("Unexpected input line: %s" % line.strip())
    # BUG FIX: flush the final step section, which the loop never wrote
    if first_pos is not None:
        indexes.add(last_chrom, start, end, first_pos)
    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
def read_len(f):
    """Read a 'LEN' file and return a mapping from chromosome to length."""
    mapping = dict()
    for line in f:
        fields = line.split()
        mapping[fields[0]] = int(fields[1])
    return mapping


options, args = doc_optparse.parse(__doc__)
try:
    in_fname, len_fname = args
except Exception:
    doc_optparse.exit()

bitsets = binned_bitsets_from_file(open(in_fname))
lens = read_len(open(len_fname))

for chrom in lens:
    if chrom in bitsets:
        bits = bitsets[chrom]
        # Complement: set bits now mark bases NOT covered by the input
        bits.invert()
        # Renamed from 'len', which shadowed the builtin
        chrom_len = lens[chrom]
        end = 0
        # Walk successive set runs of the inverted bitset
        while True:
            start = bits.next_set(end)
            if start == bits.size:
                break
            end = bits.next_clear(start)
            # NOTE(review): the located (start, end) span is never emitted
            # here and chrom_len is unused -- the output statement appears
            # to be missing; confirm against the original script.
def main():
    """Report per-species alignability fractions for BED3 intervals on stdin.

    Prints "NA" where missing data dominates, otherwise the fraction of
    non-missing bases that are aligned.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        species = options.species.split(",")
        prefix = options.prefix
        use_cache = bool(options.usecache)
        if not prefix:
            prefix = ""
    except Exception:
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed(maf_files, parse_e_rows=True,
                                      use_cache=use_cache)
    # Print header (Python 2 trailing-comma prints converted to end=" ")
    print("#chr", "start", "end", end=" ")
    for s in species:
        print(s, end=" ")
    print()
    # Iterate over input ranges
    for line in sys.stdin:
        fields = line.split()
        # Input is BED3+ ('chr' renamed to 'chrom': it shadowed the builtin)
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        length = end - start
        assert length > 0, "Interval has length less than one"
        # Prepend prefix if specified
        src = prefix + chrom
        # Keep a bitset for each species noting covered pieces
        aligned_bits = []
        missing_bits = []
        for s in species:
            aligned_bits.append(zeros(length, dtype=bool))
            missing_bits.append(zeros(length, dtype=bool))
        # Find overlap with reference component
        blocks = index.get(src, start, end)
        # Determine alignability for each position
        for block in blocks:
            # Determine the piece of the human interval this block covers,
            # relative to the start of the interval of interest
            ref = block.get_component_by_src(src)
            assert ref.strand == "+", \
                "Reference species blocks must be on '+' strand"
            rel_start = max(start, ref.start) - start
            rel_end = min(end, ref.end) - start
            # Check alignability for each species
            for i, s in enumerate(species):
                other = block.get_component_by_src_start(s)
                # Species does not appear at all indicates unaligned (best we
                # can do here?)
                if other is None:
                    continue
                # An empty component might indicate missing data, all other
                # cases (even contiguous) we count as not aligned
                if other.empty:
                    if other.synteny_empty == bx.align.maf.MAF_MISSING_STATUS:
                        missing_bits[i][rel_start:rel_end] = True
                # Otherwise we have a local alignment with some text, call
                # it aligned
                else:
                    aligned_bits[i][rel_start:rel_end] = True
        # Now determine the total alignment coverage of each interval
        print(chrom, start, end, end=" ")
        for i, s in enumerate(species):
            aligned = sum(aligned_bits[i])
            missing = sum(missing_bits[i])
            # An interval will be called missing if it is < 100bp and <50%
            # present, or more than 100bp and less that 50bp present (yes,
            # arbitrary)
            if length < 100 and missing > (length / 2):
                print("NA", end=" ")
            elif length >= 100 and missing > 50:
                print("NA", end=" ")
            else:
                # True division: the fraction is intended to be in [0, 1]
                print(aligned / (length - missing), end=" ")
        print()
    # Close MAF files
    index.close()
from bx.bitset_builders import binned_bitsets_from_file
from bx.cookbook import doc_optparse


def print_bits_as_bed(bits):
    """Print each set run of `bits` as a BED line.

    NOTE(review): relies on the module-level `chrom` loop variable for the
    chromosome column rather than taking it as a parameter.
    """
    end = 0
    while True:
        start = bits.next_set(end)
        if start == bits.size:
            break
        end = bits.next_clear(start)
        # Converted from the Python 2 print statement
        print("%s\t%d\t%d" % (chrom, start, end))


options, args = doc_optparse.parse(__doc__)
try:
    in_fname, in2_fname = args
except Exception:
    doc_optparse.exit()

# Read both bed files into bitsets
bitsets1 = binned_bitsets_from_file(open(in_fname))
bitsets2 = binned_bitsets_from_file(open(in2_fname))

# Base-wise subtraction: emit bases of the first set not covered by the second
# (the original also tested "chrom not in bitsets1" inside "for chrom in
# bitsets1" -- always false, removed)
for chrom in bitsets1:
    bits1 = bitsets1[chrom]
    if chrom in bitsets2:
        bits2 = bitsets2[chrom]
        bits2.invert()
        bits1.iand(bits2)
    print_bits_as_bed(bits1)
def main():
    """Count SNP and divergence sites inside features vs. ancestral repeats (AR).

    args: feature BED, AR BED, SNP BED, directory of per-chromosome
    divergence BED files.  Options: --lens (chromosome lengths), --suffix
    (divergence file name suffix), --mask (regions to exclude).
    Progress goes to stderr; per-interval and total counts go to stdout.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        # Optional chromosome-length table used to size the bitsets
        lens = {}
        if options.lens:
            for line in open(options.lens):
                chrom, length = line.split()
                lens[chrom] = int(length)
        if options.suffix:
            suffix = options.suffix
        else:
            suffix = ""
        print("\nReading feature", end=' ', file=sys.stderr)
        interval_file = open(args[0])
        feature = binned_bitsets_from_file(interval_file, lens=lens)
        interval_file.close()
        # reuse interval file: second pass keeps the individual (start, end)
        # pairs for the per-interval report at the bottom
        intervals = {}
        interval_file = open(args[0])
        for line in interval_file:
            fields = line.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            if chrom not in intervals:
                intervals[chrom] = []
            intervals[chrom].append([start, end])
        interval_file.close()
        print("\nReading ar", end=' ', file=sys.stderr)
        ar = binned_bitsets_from_file(open(args[1]), lens=lens)
        print("\nReading snps", end=' ', file=sys.stderr)
        snp = binned_bitsets_from_file(open(args[2]), lens=lens)
        # Inverted/plain copies: snp_mask excludes SNP sites, snp_copy lets
        # us restore snp after it is destructively AND-ed below
        snp_mask = clone_inverted(snp)
        snp_copy = clone(snp)
        print("\nMasking AR", end=' ', file=sys.stderr)
        ar_mask = clone_inverted(ar)
        print(file=sys.stderr)
        dirname = args[3]
        if options.mask:
            mask = binned_bitsets_from_file(open(options.mask), lens=lens)
        else:
            mask = None
    except:
        doc_optparse.exit()
    # Restrict feature and AR sets to the masked-in regions, if given
    if mask:
        for chrom in mask.keys():
            if chrom in feature:
                feature[chrom].iand(mask[chrom])
            if chrom in ar:
                ar[chrom].iand(mask[chrom])
    # divergence and snp counts for all features
    feature_div_count = 0
    feature_snp_count = 0
    ar_div_count = 0
    ar_snp_count = 0
    # collect snp and div
    for chr in feature.keys():
        # Only chromosomes present in all three data sets contribute
        if chr not in snp:
            continue
        if chr not in ar:
            continue
        print("reading %s ..." % chr, end=' ', file=sys.stderr)
        try:
            div = binned_bitsets_from_file(
                open(dirname + "/%s.bed" % (chr + suffix)), lens=lens)
        except:
            # Best-effort: a missing divergence file skips the chromosome
            print("%s.bed not found" % chr, file=sys.stderr)
            continue
        div[chr].iand(snp_mask[chr])  # div/snp sites count snp-only
        div_copy = clone(div)
        # --- AR pass: intersect snp/div with the AR regions ---
        print("AR:", chr, end=' ', file=sys.stderr)
        snp[chr].iand(ar[chr])
        div[chr].iand(ar[chr])
        snp_count = snp[chr].count_range(0, snp[chr].size)
        ar_snp_count += snp_count
        print(snp_count, end=' ', file=sys.stderr)
        try:
            div_count = div[chr].count_range(0, div[chr].size)
            ar_div_count += div_count
            print(div_count, file=sys.stderr)
        except:
            # Best-effort: keep going if counting fails for this chromosome
            print(chr, "failed", file=sys.stderr)
        # Restore the unclipped div/snp sets for the feature pass
        div = div_copy
        snp[chr] = snp_copy[chr]
        # --- feature pass: intersect snp/div with non-AR feature regions ---
        print("feature:", chr, end=' ', file=sys.stderr)
        feature[chr].iand(ar_mask[chr])  # clip to non-AR only
        snp[chr].iand(feature[chr])
        div[chr].iand(feature[chr])
        feature_snp_count += snp[chr].count_range(0, snp[chr].size)
        print(snp[chr].count_range(0, snp[chr].size),
              div[chr].count_range(0, div[chr].size), file=sys.stderr)
        feature_div_count += div[chr].count_range(0, div[chr].size)
        print(snp[chr].count_range(0, snp[chr].size),
              div[chr].count_range(0, div[chr].size), file=sys.stderr)
        # Note: can loop over feature intervals here for individual counts
        if chr in intervals:
            for start, end in intervals[chr]:
                ind_div_count = div[chr].count_range(start, end - start)
                ind_snp_count = snp[chr].count_range(start, end - start)
                print(chr, start, end, ind_div_count, ind_snp_count)
    # Grand totals
    print("feature snp\t%d" % feature_snp_count)
    print("feature div\t%d" % feature_div_count)
    print("ar snp\t%d" % ar_snp_count)
    print("ar div\t%d" % ar_div_count)
def main():
    """Extract MAF blocks overlapping intervals read from stdin.

    Positional args: one or more indexed MAF files.
    Options: --mincols, --src (fixed reference src), --prefix, --dir (write
    one MAF file per interval instead of stdout), --chop, --strand, --usecache.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 0
        if options.src:
            fixed_src = options.src
        else:
            fixed_src = None
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
        if options.dir:
            dir = options.dir
        else:
            dir = None
        chop = bool(options.chop)
        do_strand = bool(options.strand)
        use_cache = bool(options.usecache)
    except Exception:
        # Narrowed from a bare `except:`.
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed(maf_files, keep_open=True,
                                      parse_e_rows=True, use_cache=use_cache)
    # Start MAF on stdout (per-interval files are opened inside the loop)
    if dir is None:
        out = bx.align.maf.Writer(sys.stdout)
    # Iterate over input ranges
    for line in sys.stdin:
        strand = None
        fields = line.split()
        if fixed_src:
            src, start, end = fixed_src, int(fields[0]), int(fields[1])
            if do_strand:
                strand = fields[2]
        else:
            src, start, end = fields[0], int(fields[1]), int(fields[2])
            if do_strand:
                strand = fields[3]
        if prefix:
            src = prefix + src
        # Find overlap with reference component
        blocks = index.get(src, start, end)
        # Open file if needed
        if dir:
            out = bx.align.maf.Writer(
                open(os.path.join(dir, "%s:%09d-%09d.maf" % (src, start, end)), 'w'))
        # Write each intersecting block
        if chop:
            for block in blocks:
                for ref in block.get_components_by_src(src):
                    slice_start = max(start, ref.get_forward_strand_start())
                    slice_end = min(end, ref.get_forward_strand_end())
                    if slice_end <= slice_start:
                        continue
                    sliced = block.slice_by_component(ref, slice_start, slice_end)
                    # If the block is shorter than the minimum allowed size, skip it
                    if mincols and (sliced.text_size < mincols):
                        continue
                    # If the reference component is empty, don't write the block
                    if sliced.get_component_by_src(src).size < 1:
                        continue
                    # Keep only components that are not empty
                    sliced.components = [c for c in sliced.components if c.size > 0]
                    # Reverse complement if needed (was `strand != None`)
                    if strand is not None and ref.strand != strand:
                        sliced = sliced.reverse_complement()
                    # Write the block
                    out.write(sliced)
        else:
            for block in blocks:
                out.write(block)
        if dir:
            out.close()
    # Close the stdout writer only when we created one. Previously this was
    # unconditional, which raised NameError when --dir was given and stdin was
    # empty, and double-closed the last per-interval writer otherwise.
    if dir is None:
        out.close()
    index.close()
def main():
    """Report codon-position degeneracy for CDS intervals as per-base lines.

    Positional args: nib directory, BED/CDS file.
    Options: --outfile (default stdout), --format (default 'bed'),
    --allpositions (report computed degeneracy at all three codon positions),
    --include_name (append the interval name to each output line).

    Ported from Python 2: print statements, `except E, v` / `raise E, v`
    syntax, and `dict.values().count()` (views have no .count() in Python 3).
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        if options.outfile:
            out = open(options.outfile, "w")
        else:
            out = sys.stdout
        if options.format:
            fmt = options.format
        else:
            fmt = 'bed'
        allpositions = bool(options.allpositions)
        include_name = bool(options.include_name)
        nibdir = args[0]
        bedfile = args[1]
    except Exception:
        # Narrowed from a bare `except:`.
        doc_optparse.exit()
    nibs = getnib(nibdir)
    for chrom, strand, cds_exons, name in CDSReader(open(bedfile), format=fmt):
        cds_seq = ''
        # genome_seq_index maps the position in CDS to position on the genome
        genome_seq_index = []
        for (c_start, c_end) in cds_exons:
            cds_seq += nibs[chrom].get(c_start, c_end - c_start)
            for i in range(c_start, c_end):
                genome_seq_index.append(i)
        cds_seq = cds_seq.upper()
        if strand == '+':
            frsts = range(0, len(cds_seq), 3)
            offsign = 1
        else:
            cds_seq = Comp(cds_seq)
            frsts = range(2, len(cds_seq), 3)
            offsign = -1
        offone = 1 * offsign
        offtwo = 2 * offsign
        bases = ['A', 'C', 'G', 'T']  # renamed from `all` (shadowed builtin)
        for first_pos in frsts:
            c1 = first_pos
            c2 = first_pos + offone
            c3 = first_pos + offtwo
            # Explicit check instead of assert-in-try (asserts vanish under -O).
            if c3 >= len(cds_seq):
                print("out of sequence at %d for %s, %d"
                      % (c3, chrom, genome_seq_index[first_pos]), file=sys.stderr)
                continue
            codon = cds_seq[c1], cds_seq[c2], cds_seq[c3]
            aa = translate(codon, GEN_CODE)
            # list(...) required: Python 3 dict views have no .count()
            degeneracy3 = str(list(GEN_CODE[codon[0]][codon[1]].values()).count(aa)) + "d"
            if not include_name:
                name_text = ''
            else:
                name_text = name.replace(' ', '_')
            if allpositions:
                try:
                    degeneracy1 = str([GEN_CODE[k][codon[1]][codon[2]] for k in bases].count(aa)) + "d"
                    degeneracy2 = str([GEN_CODE[codon[0]][k][codon[2]] for k in bases].count(aa)) + "d"
                except TypeError:
                    print(GEN_CODE.values(), file=sys.stderr)
                    raise
                if strand == '+':
                    print(chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text, file=out)
                    print(chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text, file=out)
                    print(chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text, file=out)
                else:
                    # On '-' strand the codon runs right-to-left on the genome.
                    print(chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text, file=out)
                    print(chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text, file=out)
                    print(chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text, file=out)
            else:
                if strand == '+':
                    # First two codon positions are reported as "1d".
                    for b in c1, c2:
                        print(chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text, file=out)
                    print(chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text, file=out)
                else:
                    print(chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text, file=out)
                    for b in c2, c1:
                        print(chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text, file=out)
def main():
    """Build an interval index for a wiggle file (plain, .bz2+bz2t, or .lzo+lzot).

    args[0] is the wiggle file; args[1], if given, is the index file name,
    otherwise "<wiggle>.index" (with any compression suffix stripped).
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return
    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith(".bz2"):
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File(wiggle_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with bzip-table.")
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile(wiggle_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open(wiggle_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except Exception:
        # Narrowed from a bare `except:`.
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the wiggle iterator, as there is no next() and thus no way to
    # access the byte positions; the following is modified from wiggle.py.
    last_chrom = None
    start = None
    end = None
    first_pos = None
    mode = "bed"
    while True:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line:
            break
        if line.isspace() or line.startswith("track") or line.startswith("#") or line.startswith("browser"):
            continue
        elif line.startswith("variableStep") or line.startswith("fixedStep"):
            # New declaration line: index the region covered by the previous one.
            if first_pos is not None:
                indexes.add(last_chrom, start, end, first_pos)
            first_pos = pos
            header = bx.wiggle.parse_header(line)
            last_chrom = header['chrom']
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header:
                current_span = int(header['span'])
            else:
                current_span = 1
            if 'step' in header:
                current_step = int(header['step'])
            if line.startswith("variableStep"):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "bed":
            # BED-style data line: index it directly. Was keyed on
            # `line.startswith("bed")` and used `fields` before assignment
            # (NameError); dispatch on the current mode and parse the line.
            fields = line.split()
            indexes.add(fields[0], int(fields[1]), int(fields[2]), pos)
        elif mode == "variableStep":
            fields = line.split()
            end = int(fields[0]) - 1 + current_span
        elif mode == "fixedStep":
            # NOTE(review): a fixedStep header without 'step' leaves
            # current_step None and this would raise — confirm inputs.
            end += current_step
        else:
            # Was `raise "..."` — string exceptions are illegal in Python 3.
            raise ValueError("Unexpected input line: %s" % line.strip())
    # Index the final variableStep/fixedStep region, which the loop above
    # only flushed when another declaration line followed it.
    if first_pos is not None:
        indexes.add(last_chrom, start, end, first_pos)
    # Binary mode: the index is packed binary data, not text.
    out = open(index_file, 'wb')
    indexes.write(out)
    out.close()
def main():
    """Report per-species alignment coverage fractions for BED3 intervals.

    Reads BED3+ intervals on stdin; for each species (--species, comma
    separated) prints the fraction of non-missing bases that are aligned, or
    "NA" when too much of the interval is missing data.
    Positional args: one or more indexed MAF files.
    Options: --species, --prefix, --usecache.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        species = options.species.split(",")
        prefix = options.prefix
        use_cache = bool(options.usecache)
        if not prefix:
            prefix = ""
    except Exception:
        # Narrowed from a bare `except:`.
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed(maf_files, parse_e_rows=True, use_cache=use_cache)
    # Print header
    print("#chr", "start", "end", end=' ')
    for s in species:
        print(s, end=' ')
    print()
    # Iterate over input ranges
    for line in sys.stdin:
        fields = line.split()
        # Input is BED3+ (`chrom` renamed from `chr`, which shadows a builtin)
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        length = end - start
        assert length > 0, "Interval has length less than one"
        # Prepend prefix if specified
        src = prefix + chrom
        # Keep a bitset for each species noting covered pieces
        aligned_bits = []
        missing_bits = []
        for s in species:
            aligned_bits.append(zeros(length, dtype=bool))
            missing_bits.append(zeros(length, dtype=bool))
        # Find overlap with reference component
        blocks = index.get(src, start, end)
        # Determine alignability for each position
        for block in blocks:
            # Determine the piece of the human interval this block covers,
            # relative to the start of the interval of interest
            ref = block.get_component_by_src(src)
            assert ref.strand == "+", \
                "Reference species blocks must be on '+' strand"
            rel_start = max(start, ref.start) - start
            rel_end = min(end, ref.end) - start
            # Check alignability for each species
            for i, s in enumerate(species):
                other = block.get_component_by_src_start(s)
                # Species does not appear at all indicates unaligned (best we
                # can do here?)
                if other is None:
                    continue
                # An empty component might indicate missing data, all other
                # cases (even contiguous) we count as not aligned
                if other.empty:
                    if other.synteny_empty == bx.align.maf.MAF_MISSING_STATUS:
                        missing_bits[i][rel_start:rel_end] = True
                # Otherwise we have a local alignment with some text, call
                # it aligned
                else:
                    aligned_bits[i][rel_start:rel_end] = True
        # Now determine the total alignment coverage of each interval
        print(chrom, start, end, end=' ')
        for i, s in enumerate(species):
            aligned = sum(aligned_bits[i])
            missing = sum(missing_bits[i])
            # An interval will be called missing if it is < 100bp and <50%
            # present, or more than 100bp and less than 50bp present (yes,
            # arbitrary)
            if length < 100 and missing > (length / 2):
                print("NA", end=' ')
            elif length >= 100 and missing > 50:
                print("NA", end=' ')
            else:
                # Division is safe: the NA branches above catch missing==length.
                print(aligned / (length - missing), end=' ')
        print()
    # Close MAF files
    index.close()
def main():
    """Count SNP and divergence sites within features vs. ancestral repeats (AR).

    Ported from Python 2 (`print >>f, x,` statements) to the print function so
    the block is valid in the same file as its Python 3 siblings; bare
    `except:` clauses narrowed to `except Exception:`.

    Positional args: feature BED, AR BED, SNP BED, divergence BED directory.
    Options: --lens, --suffix, --mask.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        # Optional chromosome length table: "<chrom> <length>" per line.
        lens = {}
        if options.lens:
            for line in open(options.lens):
                chrom, length = line.split()
                lens[chrom] = int(length)
        if options.suffix:
            suffix = options.suffix
        else:
            suffix = ""
        print("\nReading feature", end=' ', file=sys.stderr)
        interval_file = open(args[0])
        feature = binned_bitsets_from_file(interval_file, lens=lens)
        interval_file.close()
        # reuse interval file to record per-interval coordinates
        intervals = {}
        interval_file = open(args[0])
        for line in interval_file:
            fields = line.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            if chrom not in intervals:
                intervals[chrom] = []
            intervals[chrom].append([start, end])
        interval_file.close()
        print("\nReading ar", end=' ', file=sys.stderr)
        ar = binned_bitsets_from_file(open(args[1]), lens=lens)
        print("\nReading snps", end=' ', file=sys.stderr)
        snp = binned_bitsets_from_file(open(args[2]), lens=lens)
        snp_mask = clone_inverted(snp)
        snp_copy = clone(snp)
        print("\nMasking AR", end=' ', file=sys.stderr)
        ar_mask = clone_inverted(ar)
        print(file=sys.stderr)
        dirname = args[3]
        if options.mask:
            mask = binned_bitsets_from_file(open(options.mask), lens=lens)
        else:
            mask = None
    except Exception:
        doc_optparse.exit()
    if mask:
        for chrom in mask.keys():
            if chrom in feature:
                feature[chrom].iand(mask[chrom])
            if chrom in ar:
                ar[chrom].iand(mask[chrom])
    # divergence and snp counts for all features
    feature_div_count = 0
    feature_snp_count = 0
    ar_div_count = 0
    ar_snp_count = 0
    # collect snp and div
    for chr in feature.keys():
        if chr not in snp:
            continue
        if chr not in ar:
            continue
        print("reading %s ..." % chr, end=' ', file=sys.stderr)
        try:
            div = binned_bitsets_from_file(open(dirname + "/%s.bed" % (chr + suffix)), lens=lens)
        except Exception:
            print("%s.bed not found" % chr, file=sys.stderr)
            continue
        div[chr].iand(snp_mask[chr])  # div/snp sites count snp-only
        div_copy = clone(div)
        print("AR:", chr, end=' ', file=sys.stderr)
        snp[chr].iand(ar[chr])
        div[chr].iand(ar[chr])
        snp_count = snp[chr].count_range(0, snp[chr].size)
        ar_snp_count += snp_count
        print(snp_count, end=' ', file=sys.stderr)
        try:
            div_count = div[chr].count_range(0, div[chr].size)
            ar_div_count += div_count
            print(div_count, file=sys.stderr)
        except Exception:
            print(chr, "failed", file=sys.stderr)
        # Restore unclipped sets before the feature pass.
        div = div_copy
        snp[chr] = snp_copy[chr]
        print("feature:", chr, end=' ', file=sys.stderr)
        feature[chr].iand(ar_mask[chr])  # clip to non-AR only
        snp[chr].iand(feature[chr])
        div[chr].iand(feature[chr])
        feature_snp_count += snp[chr].count_range(0, snp[chr].size)
        print(snp[chr].count_range(0, snp[chr].size), div[chr].count_range(0, div[chr].size), file=sys.stderr)
        feature_div_count += div[chr].count_range(0, div[chr].size)
        print(snp[chr].count_range(0, snp[chr].size), div[chr].count_range(0, div[chr].size), file=sys.stderr)
        # Note: can loop over feature intervals here for individual counts
        if chr in intervals:
            for start, end in intervals[chr]:
                ind_div_count = div[chr].count_range(start, end - start)
                ind_snp_count = snp[chr].count_range(start, end - start)
                print(chr, start, end, ind_div_count, ind_snp_count)
    print("feature snp\t%d" % feature_snp_count)
    print("feature div\t%d" % feature_div_count)
    print("ar snp\t%d" % ar_snp_count)
    print("ar div\t%d" % ar_div_count)
def main():
    """Extract MAF blocks overlapping intervals read from stdin.

    Positional args: one or more indexed MAF files.
    Options: --mincols, --src (fixed reference src), --prefix, --dir (write
    one MAF file per interval instead of stdout), --chop, --strand, --usecache.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        maf_files = args
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 0
        if options.src:
            fixed_src = options.src
        else:
            fixed_src = None
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
        if options.dir:
            dir = options.dir
        else:
            dir = None
        chop = bool(options.chop)
        do_strand = bool(options.strand)
        use_cache = bool(options.usecache)
    except Exception:
        # Narrowed from a bare `except:`.
        doc_optparse.exit()
    # Open indexed access to mafs
    index = bx.align.maf.MultiIndexed(maf_files, keep_open=True,
                                      parse_e_rows=True, use_cache=use_cache)
    # Start MAF on stdout (per-interval files are opened inside the loop)
    if dir is None:
        out = bx.align.maf.Writer(sys.stdout)
    # Iterate over input ranges
    for line in sys.stdin:
        strand = None
        fields = line.split()
        if fixed_src:
            src, start, end = fixed_src, int(fields[0]), int(fields[1])
            if do_strand:
                strand = fields[2]
        else:
            src, start, end = fields[0], int(fields[1]), int(fields[2])
            if do_strand:
                strand = fields[3]
        if prefix:
            src = prefix + src
        # Find overlap with reference component
        blocks = index.get(src, start, end)
        # Open file if needed
        if dir:
            out = bx.align.maf.Writer(
                open(os.path.join(dir, "%s:%09d-%09d.maf" % (src, start, end)), 'w'))
        # Write each intersecting block
        if chop:
            for block in blocks:
                for ref in block.get_components_by_src(src):
                    slice_start = max(start, ref.get_forward_strand_start())
                    slice_end = min(end, ref.get_forward_strand_end())
                    if slice_end <= slice_start:
                        continue
                    sliced = block.slice_by_component(ref, slice_start, slice_end)
                    # If the block is shorter than the minimum allowed size, skip it
                    if mincols and (sliced.text_size < mincols):
                        continue
                    # If the reference component is empty, don't write the block
                    if sliced.get_component_by_src(src).size < 1:
                        continue
                    # Keep only components that are not empty
                    sliced.components = [c for c in sliced.components if c.size > 0]
                    # Reverse complement if needed (was `strand != None`)
                    if strand is not None and ref.strand != strand:
                        sliced = sliced.reverse_complement()
                    # Write the block
                    out.write(sliced)
        else:
            for block in blocks:
                out.write(block)
        if dir:
            out.close()
    # Close the stdout writer only when we created one. Previously this was
    # unconditional, which raised NameError when --dir was given and stdin was
    # empty, and double-closed the last per-interval writer otherwise.
    if dir is None:
        out.close()
    index.close()
def main():
    """Aggregate per-base scores over intervals, appending avg/min/max columns.

    Positional args: score source (binned-array dir or wiggle), interval file,
    1-based chrom/start/stop column numbers, optional output file (default
    stdout). Options: --binned, --mask, --chrom_buffer.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()
    if score_fname == 'None':
        stop_err(
            'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.'
        )
    try:
        # Convert 1-based column numbers to 0-based indexes.
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except Exception:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )
    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )
    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except Exception:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)
    if mask_fname:
        masks = binned_bitsets_from_file(open(mask_fname))
    else:
        masks = None
    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    for i, line in enumerate(open(interval_fname)):
        valid = True
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            fields = line.split()
            try:
                chrom, start, stop = fields[chrom_col], int(
                    fields[start_col]), int(fields[stop_col])
            except Exception:
                # Only flag here; counting/recording happens once in the
                # `else` below (previously both places incremented, so every
                # invalid line was counted twice).
                valid = False
            if valid:
                total = 0
                count = 0
                # +/-inf instead of +/-100000000 sentinels so scores outside
                # that range are still tracked correctly.
                min_score = float('inf')
                max_score = float('-inf')
                # Hoist loop-invariant chromosome lookups out of the per-base loop.
                if chrom in scores_by_chrom:
                    chrom_scores = scores_by_chrom[chrom]
                else:
                    chrom_scores = None
                if masks and chrom in masks:
                    chrom_mask = masks[chrom]
                else:
                    chrom_mask = None
                if chrom_scores is not None:
                    for j in range(start, stop):
                        try:
                            # Skip if base is masked
                            if chrom_mask is not None and chrom_mask[j]:
                                continue
                            # Get the score, only count if not 'nan'
                            score = chrom_scores[j]
                            if not isnan(score):
                                total += score
                                count += 1
                                max_score = max(score, max_score)
                                min_score = min(score, min_score)
                        except Exception:
                            # Out-of-range positions are simply skipped.
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"
                # Build the resulting line of data
                out_line = list(fields)
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)
                print("\t".join(map(str, out_line)), file=out_file)
            else:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith('#'):
            # We'll save the original comments
            print(line, file=out_file)
    out_file.close()
    if skipped_lines > 0:
        print(
            'Data issue: skipped %d invalid lines starting at line #%d which is "%s"'
            % (skipped_lines, first_invalid_line, invalid_line))
        if skipped_lines == i:
            print(
                'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
            )
def main():
    """Aggregate per-base scores over intervals, appending avg/min/max columns.

    Positional args: score source (binned-array dir or wiggle), interval file,
    1-based chrom/start/stop column numbers, optional output file (default
    stdout). Options: --binned, --mask, --chrom_buffer.
    Bare `except:` clauses narrowed to `except Exception:` for consistency
    with this script's other copy in the file.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()
    if score_fname == 'None':
        stop_err('This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.')
    try:
        # Convert 1-based column numbers to 0-based indexes.
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except Exception:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')
    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')
    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except Exception:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)
    if mask_fname:
        masks = binned_bitsets_from_file(open(mask_fname))
    else:
        masks = None
    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    for i, line in enumerate(open(interval_fname)):
        valid = True
        line = line.rstrip('\r\n')
        if line and not line.startswith('#'):
            fields = line.split()
            try:
                chrom, start, stop = fields[chrom_col], int(fields[start_col]), int(fields[stop_col])
            except Exception:
                # Only flag here; counting/recording happens once in the
                # `else` below (previously both places incremented, so every
                # invalid line was counted twice).
                valid = False
            if valid:
                total = 0
                count = 0
                # +/-inf instead of +/-100000000 sentinels so scores outside
                # that range are still tracked correctly.
                min_score = float('inf')
                max_score = float('-inf')
                for j in range(start, stop):
                    if chrom in scores_by_chrom:
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isnan(score):
                                total += score
                                count += 1
                                max_score = max(score, max_score)
                                min_score = min(score, min_score)
                        except Exception:
                            # Out-of-range positions are simply skipped.
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"
                # Build the resulting line of data
                out_line = list(fields)
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)
                print("\t".join(map(str, out_line)), file=out_file)
            else:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
        elif line.startswith('#'):
            # We'll save the original comments
            print(line, file=out_file)
    out_file.close()
    if skipped_lines > 0:
        print('Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % (skipped_lines, first_invalid_line, invalid_line))
        if skipped_lines == i:
            print('Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.')
def __main__():
    """Write MAF blocks (from stdin) that intersect a set of intervals.

    Positional args: interval (BED-like) file; then either an integer index of
    the reference component or a reference species name.
    Options: --mincols (minimum text size, default 10), --prefix.

    Ported from Python 2: `file()` (removed in Python 3) replaced with
    `open()`; bare excepts narrowed.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    try:
        range_filename = args[0]
        # args[1] is either a numeric component index or a species name.
        try:
            refindex = int(args[1])
            refname = None
        except ValueError:
            refindex = None
            refname = args[1]
        if options.mincols:
            mincols = int(options.mincols)
        else:
            mincols = 10
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = ""
    except Exception:
        doc_optparse.exit()
    # Load Intervals, one Intersecter per (prefixed) source name
    intersecters = dict()
    for line in open(range_filename):
        fields = line.split()
        src = prefix + fields[0]
        if src not in intersecters:
            intersecters[src] = intervals.Intersecter()
        intersecters[src].add_interval(
            intervals.Interval(int(fields[1]), int(fields[2])))
    # Start MAF on stdout
    out = bx.align.maf.Writer(sys.stdout)
    # Iterate over input MAF
    for maf in bx.align.maf.Reader(sys.stdin):
        if refname:
            # Locate the reference component by species name per block.
            sourcenames = [comp.src.split('.')[0] for comp in maf.components]
            try:
                refindex = sourcenames.index(refname)
            except ValueError:
                continue
        ref_component = maf.components[refindex]
        # Find overlap with reference component
        if ref_component.src not in intersecters:
            continue
        intersections = intersecters[ref_component.src].find(
            ref_component.start, ref_component.end)
        # Keep output maf ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = maf.slice_by_component(refindex, start, end)
            good = True
            for c in sliced.components:
                if c.size < 1:
                    good = False
            if good and sliced.text_size > mincols:
                out.write(sliced)
    # Close output MAF
    out.close()