def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs):
    """Wrap *reader* with interval parsing plus feature/score columns.

    Column defaults (feature=2, start=3, end=4, score=5, strand=6) match the
    GFF column layout — presumably this wraps GFF input; confirm with callers.
    ``convert_to_bed_coord`` presumably switches 1-based to 0-based
    coordinates downstream — TODO confirm where it is consumed.
    """
    NiceReaderWrapper.__init__(self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, strand_col=strand_col, fix_strand=fix_strand, **kwargs)
    # Columns the base wrapper does not track.
    self.feature_col = feature_col
    self.score_col = score_col
    self.convert_to_bed_coord = convert_to_bed_coord
    # Incremental-parse state for assembling multi-line features.
    self.last_line = None
    self.cur_offset = 0
    self.seed_interval = None
    self.seed_interval_line_len = 0
def main():
    """Compute coverage of the intervals in file 1 by the intervals in file 2.

    Reads column layouts from --cols1/--cols2, streams both interval files
    through ``coverage`` and writes the annotated lines to the output file.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname), chrom_col=chr_col_2, start_col=start_col_2, end_col=end_col_2, strand_col=strand_col_2, fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in coverage([g1, g2]):
            if isinstance(line, GenomicInterval):
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                # Comment/header lines pass through unchanged.
                out_file.write("%s\n" % line)
    except ParseError as exc:
        # fix: `except ParseError, exc` is a syntax error under Python 3
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    # fix: output file was leaked on the success path
    out_file.close()
def __init__(
    self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs
):
    """Wrap *reader*, delegating interval columns to NiceReaderWrapper.

    Column defaults (feature=2, start=3, end=4, score=5, strand=6) follow the
    GFF column layout — presumably a GFF source; TODO confirm with callers.
    """
    NiceReaderWrapper.__init__(
        self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, strand_col=strand_col, fix_strand=fix_strand, **kwargs
    )
    # Extra columns not handled by the base wrapper.
    self.feature_col = feature_col
    self.score_col = score_col
    # NOTE(review): presumably toggles 1-based -> 0-based conversion — confirm.
    self.convert_to_bed_coord = convert_to_bed_coord
    # Parsing state used while accumulating multi-line features.
    self.last_line = None
    self.cur_offset = 0
    self.seed_interval = None
    self.seed_interval_line_len = 0
def subtract_files(cls, fn1, fn2, out_fn):
    """Write intervals of *fn1* minus intervals of *fn2* to *out_fn*.

    Uses ``subtract`` with ``pieces=True`` so overlaps split intervals into
    remaining fragments; ``mincols=1`` means any 1-bp overlap counts.
    """
    g1 = NiceReaderWrapper(fileinput.FileInput(fn1), fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(fn2), fix_strand=True)
    out_file = open(out_fn, "w")
    try:
        for feature in subtract([g1, g2], pieces=True, mincols=1):
            out_file.write("%s\n" % feature)
    except ParseError as exc:
        # fix: `except ParseError, exc` is a syntax error under Python 3
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    # fix: output file was leaked on the success path
    out_file.close()
def main():
    """Join two interval files on genomic overlap.

    ``--mincols`` sets the minimum overlap; ``--fill`` (left/right/both)
    emits unmatched rows from the corresponding side padded with blanks.
    """
    mincols = 1
    leftfill = False
    rightfill = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname), chrom_col=chr_col_2, start_col=start_col_2, end_col=end_col_2, strand_col=strand_col_2, fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
            if isinstance(outfields, list):
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError as exc:
        # fix: `except ParseError, exc` is a syntax error under Python 3
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    # fix: output file was leaked on the success path
    out_file.close()
def main():
    """Cluster intervals that lie within ``--distance`` of each other.

    ``--overlap`` is the negated distance (required overlap); ``--minregions``
    is the minimum number of intervals per cluster. NOTE(review): in the
    visible portion ``clusters``/``extra`` and ``output`` are computed but not
    consumed — presumably output handling follows elsewhere; confirm.
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True)
    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        # fix: `except ParseError, exc` is a syntax error under Python 3
        fail("Invalid file format: %s" % str(exc))
def main():
    """Estimate substitution rates for pairwise MAF blocks.

    Two modes: with no interval file (``int_file == "None"``) every pairwise
    block in the MAF is scored and a header is printed; with an interval file,
    blocks overlapping each interval are scored and the interval is annotated
    with alignment length, mismatches and mismatch fraction.
    """
    skipped = 0
    not_pairwise = 0
    if int_file == "None":
        try:
            maf_reader = bx.align.maf.Reader(open(inp_file, 'r'))
        except Exception:
            stop_err("Your MAF file appears to be malformed.")
        # fix: `print >> fout, ...` is Python 2 syntax
        print("#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp", file=fout)
        for block in maf_reader:
            if len(block.components) != 2:
                not_pairwise += 1
                continue
            try:
                rateEstimator(block)
            except Exception:
                # Deliberate best-effort: count and skip unparseable blocks.
                skipped += 1
    else:
        index, index_filename = maf_utilities.build_maf_index(inp_file, species=[dbkey_i])
        if index is None:
            print("Your MAF file appears to be malformed.", file=sys.stderr)
            sys.exit()
        win = NiceReaderWrapper(fileinput.FileInput(int_file), chrom_col=chr_col_i, start_col=start_col_i, end_col=end_col_i, strand_col=strand_col_i, fix_strand=True)
        species = None
        mincols = 0
        # rateEstimator accumulates into these module-level counters.
        global alignlen, mismatches
        for interval in win:
            alignlen = 0
            mismatches = 0.0
            src = "%s.%s" % (dbkey_i, interval.chrom)
            for block in maf_utilities.get_chopped_blocks_for_region(index, src, interval, species, mincols):
                if len(block.components) != 2:
                    not_pairwise += 1
                    continue
                try:
                    rateEstimator(block)
                except Exception:
                    skipped += 1
            if alignlen:
                p = mismatches / alignlen
            else:
                # No aligned columns for this interval.
                p = 'NA'
            interval.fields.append(str(alignlen))
            interval.fields.append(str(mismatches))
            interval.fields.append(str(p))
            print("\t".join(interval.fields), file=fout)
    if not_pairwise:
        # fix: Python 2 print statements converted to print() calls
        print("Skipped %d non-pairwise blocks" % (not_pairwise))
    if skipped:
        print("Skipped %d blocks as invalid" % (skipped))
def main():
    """Concatenate two interval files into the output file.

    With ``--sameformat`` the two inputs are assumed to share a column
    layout; otherwise each file's own --cols settings apply.
    """
    sameformat = False
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except:
        doc_optparse.exception()
    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ), chrom_col=chr_col_2, start_col=start_col_2, end_col=end_col_2, strand_col=strand_col_2, fix_strand=True )
    out_file = open( out_fname, "w" )
    try:
        for item in concat( [g1, g2], sameformat=sameformat ):
            # Parsed intervals are re-joined; header/comment lines pass through.
            if type( item ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( item.fields ) )
            else:
                out_file.write( "%s\n" % item )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
    out_file.close()
    # Report lines each reader had to skip.
    for reader, desc in ( ( g1, " of 1st dataset" ), ( g2, " of 2nd dataset" ) ):
        if reader.skipped > 0:
            print(skipped( reader, filedesc=desc ))
def main():
    """Merge overlapping intervals in a single interval file.

    ``--mincols`` sets the minimum overlap required to merge; with
    ``--threecol`` only chrom/start/end are written.
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for rec in merge(g1, mincols=mincols):
            if options.threecol:
                if type(rec) is GenomicInterval:
                    # NOTE(review): startCol/endCol attribute names — confirm
                    # against the GenomicInterval implementation in use.
                    out_file.write("%s\t%s\t%s\n" % (rec.chrom, str(rec.startCol), str(rec.endCol)))
                elif type(rec) is list:
                    out_file.write("%s\t%s\t%s\n" % (rec[chr_col_1], str(rec[start_col_1]), str(rec[end_col_1])))
                else:
                    out_file.write("%s\n" % rec)
            else:
                if type(rec) is GenomicInterval:
                    out_file.write("%s\n" % "\t".join(rec.fields))
                elif type(rec) is list:
                    out_file.write("%s\n" % "\t".join(rec))
                else:
                    # Comment/header lines pass through unchanged.
                    out_file.write("%s\n" % rec)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
def main():
    """Count the total number of bases covered by an interval file.

    Writes the single integer result to the output file.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        in_fname, out_fname = args
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True)
    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        # fix: `except ParseError, exc` is a syntax error under Python 3
        fail("Invalid file format: %s" % str(exc))
    # fix: original computed `bases` but never wrote it to out_fname
    out_file = open(out_fname, "w")
    out_file.write("%s\n" % str(bases))
    out_file.close()
def main():
    """Count bases covered by an interval file and write the count out."""
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()
    reader = NiceReaderWrapper( fileinput.FileInput( in_fname ), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True )
    try:
        bases = base_coverage( reader )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
    with open( out_fname, "w" ) as out_file:
        out_file.write( "%s\n" % str( bases ) )
    if reader.skipped > 0:
        print(skipped( reader, filedesc="" ))
def main():
    """Complement an interval file against chromosome lengths.

    With ``--all`` the complement covers whole chromosomes (via subtract);
    otherwise per-chromosome lengths bound the complemented regions.
    """
    allchroms = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True)
    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome. The lengths
    # are added to the lens dict and passed to the complement operation in bx.
    dbfile = fileinput.FileInput(lengths)
    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except Exception:
                # Deliberate best-effort: assume LEN doesn't exist or is corrupt.
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except Exception:
                pass
    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False
    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)
    out_file = open(out_fname, "w")
    try:
        for interval in generator:
            if type(interval) is GenomicInterval:
                # NOTE(review): joins the interval object directly rather than
                # interval.fields as sibling tools do — confirm it iterates fields.
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError as exc:
        # fix: `except ParseError, exc` is a syntax error under Python 3
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    # fix: output file was leaked on the success path
    out_file.close()
def main():
    """Cluster intervals and emit one of five output styles.

    output=1 merge: one synthetic interval per cluster region.
    output=2 filtered: original file order, only clustered lines kept.
    output=3 clustered: original lines grouped by cluster.
    output=4/5: smallest (4) or largest (5) interval of each cluster.
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # Required overlap is expressed as a negative clustering distance.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname), chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1, strand_col=strand_col_1, fix_strand=True)
    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    f1 = open(in_fname, "r")
    out_file = open(out_fname, "w")
    # If "merge"
    if output == 1:
        # Template row sized to hold the three coordinate columns.
        fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write("%s\n" % "\t".join(fields))
    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write("%s\n" % line.rstrip("\n\r"))
    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        # fix: removed dead `linenums = list()` local
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write("%s\n" % fileLines[linenum].rstrip("\n\r"))
    # If "minimum" (4) or "maximum" (5) we output one interval per cluster
    if output == 4 or output == 5:
        # fix: removed dead `linenums = list()` local
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval(
                            g1, fileline.split("\t"), g1.chrom_col, g1.start_col, g1.end_col, g1.strand_col, g1.default_strand, g1.fix_strand)
                    except Exception as exc:
                        print(str(exc), file=sys.stderr)
                        f1.close()
                        # fix: close the output file too before exiting
                        out_file.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                            (outsize > interval_size and output == 4) or \
                            (outsize < interval_size and output == 5):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write("%s\n" % outinterval)
    f1.close()
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))