def main():
    """Stream intervals from stdin through ``do_interval``.

    ``args[0]`` is translated with ``tree_tx`` and split on whitespace to
    form the source list, ``args[1]`` is opened and wrapped in a
    ``TwoBitFile``, and the remaining arguments are handed to
    ``maf.MultiIndexed``.  Output goes through a ``maf.Writer`` on stdout.
    Any error raised during this setup is passed to
    ``doc_optparse.exception()``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        species = args[0].translate(tree_tx).split()
        reference = bx.seq.twobit.TwoBitFile(open(args[1]))
        maf_index = maf.MultiIndexed(args[2:])
        writer = maf.Writer(sys.stdout)
        fill_missing = bool(options.missingData)
        honor_strand = bool(options.strand)
    except:
        doc_optparse.exception()

    for record in sys.stdin:
        columns = record.split()
        ref_src, start, end = columns[:3]
        # The sixth column is consulted only when strand handling was
        # requested and the column exists; otherwise '+' is used.
        strand = columns[5] if honor_strand and len(columns) > 5 else '+'
        do_interval(species, maf_index, writer, ref_src, int(start),
                    int(end), reference, fill_missing, strand)

    writer.close()
def main():
    """Write the results of ``proximal_region_finder`` for two interval files.

    Column layouts are read from ``options.cols1`` / ``options.cols2``; the
    four positional arguments are the two input paths, the output path and
    the direction passed to ``proximal_region_finder``.  Problems while
    reading the arguments are handed to ``doc_optparse.exception()``.  A
    ``ParseError`` raised while iterating closes the output file and is then
    reported through ``fail``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")
    try:
        for line in proximal_region_finder([g1, g2], direction):
            # Lists are written tab-joined; anything else is written as-is.
            if type(line) is list:
                out_file.write("%s\n" % "\t".join(line))
            else:
                out_file.write("%s\n" % line)
    except ParseError as exc:
        # Release the output handle before reporting the failure.
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Collect windowing options and invoke ``run`` on the data and model files.

    Positional arguments are the data file, the model file and the output
    file.  ``window``, ``shift``, ``low`` and ``high`` are fetched with the
    project ``getopt`` helper using defaults of 100, 5, -1.0 and 1.0.  An
    optional mapping file is loaded with
    ``rp.mapping.alignment_mapping_from_file`` and an optional ``reorder``
    option is parsed as a comma-separated list of integers.  Any error in
    this phase is passed to ``doc_optparse.exception()``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        data_fname, model_fname, out_fname = args
        window = int(getopt(options, "window", 100))
        shift = int(getopt(options, "shift", 5))
        low = float(getopt(options, "low", -1.0))
        high = float(getopt(options, "high", 1.0))
        mapping = None
        if options.mapping:
            # Only the mapping half of the returned pair is used below.
            _count, mapping = rp.mapping.alignment_mapping_from_file(open(options.mapping))
        modname = options.model
        if modname is None:
            modname = "standard"
        reorder = getopt(options, "reorder", None)
        if reorder:
            reorder = [int(piece) for piece in reorder.split(",")]
    except:
        doc_optparse.exception()

    out = open(out_fname, "w")
    run(open(data_fname), modname, open(model_fname), out, mapping,
        window, shift, low, high, reorder)
    out.close()
def main():
    """Intersect two interval files, either of which may be flagged as GFF.

    Column layouts come from ``options.cols1`` / ``options.cols2``;
    ``options.mincols`` (default 1) and ``options.pieces`` are forwarded to
    ``intersect``.  Inputs flagged with ``options.gff1`` / ``options.gff2``
    are read with ``GFFReaderWrapper`` and have ``convert_to_bed_coord``
    switched on; the others use ``NiceReaderWrapper``.  A ``ParseError``
    during iteration closes the output and is reported through ``fail``.
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg(options.cols1)
        chrom_b, start_b, end_b, strand_b = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        pieces = bool(options.pieces)
        first_is_gff = bool(options.gff1)
        second_is_gff = bool(options.gff2)
        path_a, path_b, out_path = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    reader_cls_a = GFFReaderWrapper if first_is_gff else NiceReaderWrapper
    reader_cls_b = GFFReaderWrapper if second_is_gff else NiceReaderWrapper

    first = reader_cls_a(fileinput.FileInput(path_a),
                         chrom_col=chrom_a,
                         start_col=start_a,
                         end_col=end_a,
                         strand_col=strand_a,
                         fix_strand=True)
    if first_is_gff:
        # Intersect requires coordinates in BED format.
        first.convert_to_bed_coord = True

    second = reader_cls_b(fileinput.FileInput(path_b),
                          chrom_col=chrom_b,
                          start_col=start_b,
                          end_col=end_b,
                          strand_col=strand_b,
                          fix_strand=True)
    if second_is_gff:
        # Intersect requires coordinates in BED format.
        second.convert_to_bed_coord = True

    out_file = open(out_path, "w")
    try:
        for feature in intersect([first, second], pieces=pieces, mincols=mincols):
            if isinstance(feature, GFFFeature):
                # Convert back to GFF coordinates since reader converted automatically.
                convert_bed_coords_to_gff(feature)
                for interval in feature.intervals:
                    out_file.write("%s\n" % "\t".join(interval.fields))
                continue
            if isinstance(feature, GenomicInterval):
                out_file.write("%s\n" % "\t".join(feature.fields))
                continue
            out_file.write("%s\n" % feature)
    except ParseError as e:
        out_file.close()
        fail("Invalid file format: %s" % str(e))
def main():
    """Write the output of ``coverage`` over two interval files to a file.

    Column layouts come from ``options.cols1`` / ``options.cols2`` and the
    three positional arguments name the two inputs and the output.
    Argument problems are handed to ``doc_optparse.exception()``; a
    ``ParseError`` raised while iterating closes the output and is reported
    through ``fail``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg(options.cols1)
        chrom_b, start_b, end_b, strand_b = parse_cols_arg(options.cols2)
        path_a, path_b, out_path = args
    except:
        doc_optparse.exception()

    first = NiceReaderWrapper(fileinput.FileInput(path_a),
                              chrom_col=chrom_a,
                              start_col=start_a,
                              end_col=end_a,
                              strand_col=strand_a,
                              fix_strand=True)
    second = NiceReaderWrapper(fileinput.FileInput(path_b),
                               chrom_col=chrom_b,
                               start_col=start_b,
                               end_col=end_b,
                               strand_col=strand_b,
                               fix_strand=True)

    out_file = open(out_path, "w")
    try:
        for item in coverage([first, second]):
            # Intervals are written as their tab-joined fields; anything
            # else is written unchanged.
            if type(item) is GenomicInterval:
                text = "\t".join(item.fields)
            else:
                text = item
            out_file.write("%s\n" % text)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Write the results of ``proximal_region_finder`` for two interval files.

    Column layouts are read from ``options.cols1`` / ``options.cols2``; the
    four positional arguments are the two input paths, the output path and
    the direction passed to ``proximal_region_finder``.  Problems while
    reading the arguments are handed to ``doc_optparse.exception()``.  A
    ``ParseError`` raised while iterating closes the output file and is then
    reported through ``fail``.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )
    try:
        for line in proximal_region_finder( [g1, g2], direction ):
            # Lists are written tab-joined; anything else is written as-is.
            if type( line ) is list:
                out_file.write( "%s\n" % "\t".join( line ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError as exc:
        # Release the output handle before reporting the failure.
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
def main():
    """Read clustering options and compute clusters for one interval file.

    Defaults are ``distance=0``, ``minregions=2`` and ``output=1``; each is
    overridden by the matching option when it is set, and
    ``options.overlap`` replaces ``distance`` with its negated value.
    Argument problems go to ``doc_optparse.exception()``; a ``ParseError``
    from ``find_clusters`` is reported through ``fail``.
    """
    distance, minregions, output = 0, 2, 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_idx, start_idx, end_idx, strand_idx = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # An overlap requirement is expressed as a negative distance.
            distance = -int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chrom_idx,
                           start_col=start_idx,
                           end_col=end_idx,
                           strand_col=strand_idx,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def __main__():
    """Append a computed column to each data row read from stdin.

    ``args[0]`` is an expression and ``args[1]`` the new column's name.
    The expression is compiled once and evaluated per row with the row
    bound to ``row``.  Header lines are echoed (with the column name
    appended) only when ``options.header`` is set, and comment lines only
    when ``options.comments`` is set.

    NOTE(review): the expression comes straight from the command line and
    is run through ``eval``.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        expr = args[0]
        colname = args[1]
    except:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, '<expr arg>', 'eval')

    for element in bx.tabular.io.Reader(sys.stdin):
        kind = type(element)
        if kind is bx.tabular.io.Header:
            if keep_header:
                print(str(element) + "\t" + colname)
            continue
        if kind is bx.tabular.io.Comment:
            if keep_comments:
                print(element)
            continue
        val = eval(expr, dict(row=element))
        print(str(element) + "\t" + str(val))
def main():
    """Read clustering options and compute clusters for one interval file.

    Defaults are ``distance=0``, ``minregions=2`` and ``output=1``; each is
    overridden by the matching option when it is set, and
    ``options.overlap`` replaces ``distance`` with its negated value.
    Argument problems go to ``doc_optparse.exception()``; a ``ParseError``
    from ``find_clusters`` is reported through ``fail``.
    """
    distance, minregions, output = 0, 2, 1
    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_idx, start_idx, end_idx, strand_idx = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            # An overlap requirement is expressed as a negative distance.
            distance = -int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chrom_idx,
                            start_col=start_idx,
                            end_col=end_idx,
                            strand_col=strand_idx,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
def main():
    """Write the output of ``coverage`` over two interval files to a file.

    Column layouts come from ``options.cols1`` / ``options.cols2`` and the
    three positional arguments name the two inputs and the output.
    Argument problems are handed to ``doc_optparse.exception()``; a
    ``ParseError`` raised while iterating closes the output and is reported
    through ``fail``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg(options.cols1)
        chrom_b, start_b, end_b, strand_b = parse_cols_arg(options.cols2)
        path_a, path_b, out_path = args
    except:
        doc_optparse.exception()

    readers = [
        NiceReaderWrapper(fileinput.FileInput(path_a),
                          chrom_col=chrom_a,
                          start_col=start_a,
                          end_col=end_a,
                          strand_col=strand_a,
                          fix_strand=True),
        NiceReaderWrapper(fileinput.FileInput(path_b),
                          chrom_col=chrom_b,
                          start_col=start_b,
                          end_col=end_b,
                          strand_col=strand_b,
                          fix_strand=True)
    ]

    out_file = open(out_path, "w")
    try:
        for item in coverage(readers):
            # Intervals are written as their tab-joined fields; anything
            # else is written unchanged.
            text = "\t".join(item.fields) if type(item) is GenomicInterval else item
            out_file.write("%s\n" % text)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def __main__():
    """Append a computed column to each data row read from stdin.

    ``args[0]`` is an expression and ``args[1]`` the new column's name.
    The expression is compiled once and evaluated per row with the row
    bound to ``row``.  Header lines are echoed (with the column name
    appended) only when ``options.header`` is set, and comment lines only
    when ``options.comments`` is set.

    NOTE(review): the expression comes straight from the command line and
    is run through ``eval``.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        expr = args[0]
        colname = args[1]
    except:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, '<expr arg>', 'eval')

    header_type = bx.tabular.io.Header
    for element in bx.tabular.io.Reader(sys.stdin):
        if type(element) is header_type:
            if keep_header:
                print(str(element) + "\t" + colname)
        elif type(element) is bx.tabular.io.Comment:
            if keep_comments:
                print(element)
        else:
            computed = eval(expr, dict(row=element))
            print(str(element) + "\t" + str(computed))
def __main__():
    """Parse options and, when a FASTA file is given, convert it with faToTwoBit.

    ``options.cols`` is a comma-separated column spec: with exactly five
    entries it also yields a name column, otherwise only chrom/start/end/
    strand are taken and ``name_col`` is set to False.  The two positional
    arguments are the input and output file names.  Any failure in this
    phase is passed to ``doc_optparse.exception()``.

    When ``options.fasta`` is set, ``faToTwoBit`` is run through the shell
    with its stderr captured in a temporary file; a non-zero return code is
    reported through ``stop_err`` together with the captured stderr text.
    """
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse( __doc__ )
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    # A negative strand column index is treated as "no strand column".
    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            # Only the generated name is kept; the NamedTemporaryFile object
            # itself is not retained.
            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
            # NOTE(review): the command is built by string interpolation and
            # run with shell=True, so fasta_file reaches the shell unquoted.
            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )
            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
            tmp_stderr = open( tmp_name, 'wb' )
            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open( tmp_name, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    # Stop once nothing has been read or the accumulated
                    # length is no longer a whole multiple of the buffer size.
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception, e:
            stop_err( 'Error running faToTwoBit. ' + str( e ) )
def main():
    """Build an interval index file for a MAF file.

    ``args[0]`` is the MAF path.  Paths ending in ``.bz2`` or ``.lzo`` are
    opened through ``SeekableBzip2File`` / ``SeekableLzopFile`` and need a
    companion table file (the same path with ``t`` appended); the four
    character suffix is then stripped before the default index name is
    derived.  ``args[1]``, when present, overrides the index file name, and
    ``options.species`` (comma-separated) restricts which components are
    indexed.  Any error in this phase goes to ``doc_optparse.exception()``.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)

    try:
        maf_file = args[0]
        if maf_file.endswith(".bz2"):
            # bz2 input needs a table file to be seekable.
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File(maf_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            maf_in = SeekableLzopFile(maf_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open(maf_file)

        # Determine the name of the index file
        index_file = args[1] if len(args) > 1 else maf_file + ".index"
        species = options.species.split(",") if options.species else None
    except:
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader(maf_in)
    indexes = interval_index_file.Indexes()

    # The file offset must be captured before each block is read, so the
    # reader is advanced by hand rather than iterated.
    while True:
        pos = maf_reader.file.tell()
        block = maf_reader.next()
        if block is None:
            break
        for comp in block.components:
            if species is None or comp.src.split('.')[0] in species:
                indexes.add(comp.src, comp.forward_strand_start,
                            comp.forward_strand_end, pos, max=comp.src_size)

    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
def main():
    """Read the positional arguments and the ``perCol`` option.

    Arguments 0-3 are the h5 file, mapping file, input file and output
    file names; arguments 4-6 are one-based column numbers that are
    converted to zero-based indices.  Any error is handed to
    ``doc_optparse.exception()``.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )
    try:
        h5_fname = args[0]
        mapping_fname = args[1]
        in_fname = args[2]
        out_fname = args[3]
        # Command-line columns are one-based; shift to zero-based.
        chrom_col, start_col, end_col = [ int( col ) - 1 for col in args[4:7] ]
        per_col = bool( options.perCol )
    except Exception:
        doc_optparse.exception()
def main():
    """Read the positional arguments and the ``perCol`` option.

    Arguments 0-3 are the h5 file, mapping file, input file and output
    file names; arguments 4-6 are one-based column numbers that are
    converted to zero-based indices.  Any error is handed to
    ``doc_optparse.exception()``.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        h5_fname = args[0]
        mapping_fname = args[1]
        in_fname = args[2]
        out_fname = args[3]
        # Command-line columns are one-based; shift to zero-based.
        chrom_col, start_col, end_col = [int(col) - 1 for col in args[4:7]]
        per_col = bool(options.perCol)
    except Exception:
        doc_optparse.exception()
def main():
    """Join two interval files and write the joined rows to a file.

    ``options.mincols`` (default 1) is forwarded to ``join``.
    ``options.fill`` selects the fill flags: ``"both"`` turns on both
    sides, otherwise each flag is true only when the option equals
    ``"left"`` or ``"right"`` respectively.  Argument problems go to
    ``doc_optparse.exception()``; a ``ParseError`` while iterating closes
    the output and is reported through ``fail``.
    """
    mincols = 1
    upstream_pad = 0
    downstream_pad = 0
    leftfill = rightfill = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg(options.cols1)
        chrom_b, start_b, end_b, strand_b = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        path_a, path_b, out_path = args
    except:
        doc_optparse.exception()

    first = NiceReaderWrapper(fileinput.FileInput(path_a),
                              chrom_col=chrom_a,
                              start_col=start_a,
                              end_col=end_a,
                              strand_col=strand_a,
                              fix_strand=True)
    second = NiceReaderWrapper(fileinput.FileInput(path_b),
                               chrom_col=chrom_b,
                               start_col=start_b,
                               end_col=end_b,
                               strand_col=strand_b,
                               fix_strand=True)

    out_file = open(out_path, "w")
    try:
        joined = join(first, second, mincols=mincols,
                      rightfill=rightfill, leftfill=leftfill)
        for outfields in joined:
            # Lists are written tab-joined; anything else is written as-is.
            if type(outfields) is list:
                text = "\t".join(outfields)
            else:
                text = outfields
            out_file.write("%s\n" % text)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Merge the intervals of one file and write the result.

    ``options.mincols`` (default 1) is forwarded to ``merge``.  With
    ``options.threecol`` set, interval and list results are reduced to
    three tab-separated columns; otherwise all fields are written.  Values
    that are neither a ``GenomicInterval`` nor a list are written as-is in
    both modes.  After writing, a summary from ``skipped`` is printed when
    the reader reports skipped lines.
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_idx, start_idx, end_idx, strand_idx = parse_cols_arg(options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chrom_idx,
                           start_col=start_idx,
                           end_col=end_idx,
                           strand_col=strand_idx,
                           fix_strand=True)

    out_file = open(out_fname, "w")
    try:
        for line in merge(g1, mincols=mincols):
            brief = options.threecol
            is_interval = type(line) is GenomicInterval
            is_list = (not is_interval) and type(line) is list
            if brief and is_interval:
                # NOTE(review): relies on the interval exposing
                # startCol/endCol attributes; confirm against GenomicInterval.
                out_file.write("%s\t%s\t%s\n" % (line.chrom,
                                                 str(line.startCol),
                                                 str(line.endCol)))
            elif brief and is_list:
                out_file.write("%s\t%s\t%s\n" % (line[chrom_idx],
                                                 str(line[start_idx]),
                                                 str(line[end_idx])))
            elif is_interval:
                out_file.write("%s\n" % "\t".join(line.fields))
            elif is_list:
                out_file.write("%s\n" % "\t".join(line))
            else:
                out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
def __main__():
    """Filter tabular rows from stdin, optionally keeping selected columns.

    ``options.cols`` is a comma-separated list of column selectors; entries
    that parse as integers are stored as integers, the rest as strings.
    ``args[0]``, when present, is an expression compiled once and evaluated
    per row with the row bound to ``row``; rows for which it is false are
    dropped.  Header and comment lines are echoed only when
    ``options.header`` / ``options.comments`` are set.

    NOTE(review): the expression comes straight from the command line and
    is run through ``eval``.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        cols = []
        if options.cols:
            for token in options.cols.split(','):
                try:
                    selector = int(token)
                except ValueError:
                    selector = token
                cols.append(selector)
        expr = args[0] if len(args) > 0 else None
        if options.force_header:
            force_header = bx.tabular.io.FIRST_LINE_IS_HEADER
        else:
            force_header = None
    except Exception:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, '<expr arg>', 'eval')

    reader = bx.tabular.io.TableReader(sys.stdin, force_header=force_header)
    for element in reader:
        if isinstance(element, bx.tabular.io.Header):
            if not keep_header:
                continue
            if cols:
                print("#" + "\t".join(element[c] for c in cols))
            else:
                print(element)
            continue
        if isinstance(element, bx.tabular.io.Comment):
            if keep_comments:
                print(element)
            continue
        # Data row: emit when there is no filter or the filter accepts it.
        if expr is not None and not bool(eval(expr, dict(row=element))):
            continue
        if cols:
            print("\t".join(element[c] for c in cols))
        else:
            print(element)
def main():
    """Join two interval files and write the joined rows to a file.

    ``options.mincols`` (default 1) is forwarded to ``join``.
    ``options.fill`` selects the fill flags: ``"both"`` turns on both
    sides, otherwise each flag is true only when the option equals
    ``"left"`` or ``"right"`` respectively.  Argument problems go to
    ``doc_optparse.exception()``; a ``ParseError`` while iterating closes
    the output and is reported through ``fail``.
    """
    mincols = 1
    upstream_pad = 0
    downstream_pad = 0
    leftfill = rightfill = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg( options.cols1 )
        chrom_b, start_b, end_b, strand_b = parse_cols_arg( options.cols2 )
        if options.mincols:
            mincols = int( options.mincols )
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        path_a, path_b, out_path = args
    except:
        doc_optparse.exception()

    first = NiceReaderWrapper( fileinput.FileInput( path_a ),
                               chrom_col=chrom_a,
                               start_col=start_a,
                               end_col=end_a,
                               strand_col=strand_a,
                               fix_strand=True )
    second = NiceReaderWrapper( fileinput.FileInput( path_b ),
                                chrom_col=chrom_b,
                                start_col=start_b,
                                end_col=end_b,
                                strand_col=strand_b,
                                fix_strand=True )

    out_file = open( out_path, "w" )
    try:
        for outfields in join( first, second, mincols=mincols,
                               rightfill=rightfill, leftfill=leftfill ):
            # Lists are written tab-joined; anything else is written as-is.
            text = "\t".join( outfields ) if type( outfields ) is list else outfields
            out_file.write( "%s\n" % text )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
def main():
    """Validate a dataset with its datatype and print the result.

    A ``model.Dataset`` is built from ``options.ext`` and the integer in
    ``args[0]``; ``options.metadata``, when set, is decoded with
    ``util.string_to_object`` and attached.  The value returned by
    ``data.datatype.validate`` is serialised with
    ``util.object_to_string`` and printed.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        extension = options.ext
    except:
        doc_optparse.exception()

    # create datatype
    dataset = model.Dataset( extension=extension, id=int( args[0] ) )
    # NOTE(review): hard-coded absolute path; looks developer-specific.
    dataset.file_path = "/home/ian/trunk/database/files/"

    if options.metadata:
        dataset.metadata = util.string_to_object( options.metadata )

    problems = dataset.datatype.validate( dataset )
    print(util.object_to_string( problems ))
def __main__():
    """Filter tabular rows from stdin, optionally keeping selected columns.

    ``options.cols`` is a comma-separated list of column selectors; entries
    that parse as integers are stored as integers, the rest as strings.
    ``args[0]``, when present, is an expression compiled once and evaluated
    per row with the row bound to ``row``; rows for which it is false are
    dropped.  Header and comment lines are echoed only when
    ``options.header`` / ``options.comments`` are set.  Any error while
    reading the options is handed to ``doc_optparse.exception()``.

    NOTE(review): the expression comes straight from the command line and
    is run through ``eval``.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        cols = []
        if options.cols:
            for c in options.cols.split(","):
                try:
                    v = int(c)
                except ValueError:
                    v = c
                # Keep the converted value: integers stay integers,
                # anything else stays a string.
                cols.append(v)
        if len(args) > 0:
            expr = args[0]
        else:
            expr = None
        if options.force_header:
            force_header = bx.tabular.io.FIRST_LINE_IS_HEADER
        else:
            force_header = None
    except:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, "<expr arg>", "eval")

    for element in bx.tabular.io.TableReader(sys.stdin, force_header=force_header):
        if type(element) is bx.tabular.io.Header:
            if keep_header:
                if cols:
                    print("#" + "\t".join(element[c] for c in cols))
                else:
                    print(element)
        elif type(element) is bx.tabular.io.Comment:
            if keep_comments:
                print(element)
        else:
            if expr is None or bool(eval(expr, dict(row=element))):
                if cols:
                    print("\t".join([element[c] for c in cols]))
                else:
                    print(element)
def main():
    """Concatenate two interval files and write the result.

    ``options.sameformat`` is forwarded to ``concat`` as a boolean flag.
    Argument problems go to ``doc_optparse.exception()``; a ``ParseError``
    while iterating closes the output and is reported through ``fail``.
    After writing, a summary from ``skipped`` is printed for each reader
    that reports skipped lines.
    """
    sameformat = False
    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg( options.cols1 )
        chrom_b, start_b, end_b, strand_b = parse_cols_arg( options.cols2 )
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
                            chrom_col=chrom_a,
                            start_col=start_a,
                            end_col=end_a,
                            strand_col=strand_a,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
                            chrom_col=chrom_b,
                            start_col=start_b,
                            end_col=end_b,
                            strand_col=strand_b,
                            fix_strand=True )

    out_file = open( out_fname, "w" )
    try:
        for item in concat( [g1, g2], sameformat=sameformat ):
            # Intervals are written as their tab-joined fields; anything
            # else is written unchanged.
            if type( item ) is GenomicInterval:
                text = "\t".join( item.fields )
            else:
                text = item
            out_file.write( "%s\n" % text )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
def main():
    """Merge the intervals of one file and write the result.

    ``options.mincols`` (default 1) is forwarded to ``merge``.  With
    ``options.threecol`` set, interval and list results are reduced to
    three tab-separated columns; otherwise all fields are written.  Values
    that are neither a ``GenomicInterval`` nor a list are written as-is in
    both modes.  After writing, a summary from ``skipped`` is printed when
    the reader reports skipped lines.
    """
    mincols = 1
    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_idx, start_idx, end_idx, strand_idx = parse_cols_arg( options.cols1 )
        if options.mincols:
            mincols = int( options.mincols )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chrom_idx,
                            start_col=start_idx,
                            end_col=end_idx,
                            strand_col=strand_idx,
                            fix_strand=True )

    out_file = open( out_fname, "w" )
    try:
        for line in merge( g1, mincols=mincols ):
            brief = options.threecol
            is_interval = type( line ) is GenomicInterval
            is_list = ( not is_interval ) and type( line ) is list
            if brief and is_interval:
                # NOTE(review): relies on the interval exposing
                # startCol/endCol attributes; confirm against GenomicInterval.
                out_file.write( "%s\t%s\t%s\n" % ( line.chrom,
                                                   str( line.startCol ),
                                                   str( line.endCol ) ) )
            elif brief and is_list:
                out_file.write( "%s\t%s\t%s\n" % ( line[chrom_idx],
                                                   str( line[start_idx] ),
                                                   str( line[end_idx] ) ) )
            elif is_interval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            elif is_list:
                out_file.write( "%s\n" % "\t".join( line ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
def main():
    """Stream intervals from stdin through ``do_interval``.

    ``args[0]`` is translated with ``tree_tx`` and split on whitespace to
    form the source list, ``args[1]`` is loaded with ``load_seq_db``, and
    the remaining arguments are handed to ``bx.align.maf.MultiIndexed``.
    Output goes through a ``bx.align.maf.Writer`` on stdout.  Any error
    raised during this setup is passed to ``doc_optparse.exception()``.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        species = args[0].translate( tree_tx ).split()
        sequences = load_seq_db( args[1] )
        maf_index = bx.align.maf.MultiIndexed( args[2:] )
        writer = bx.align.maf.Writer( sys.stdout )
        fill_missing = bool( options.missingData )
    except:
        doc_optparse.exception()

    for record in sys.stdin:
        # Only the first three whitespace-separated columns are used.
        ref_src, start, end = record.split()[:3]
        do_interval( species, maf_index, writer, ref_src,
                     int( start ), int( end ), sequences, fill_missing )

    writer.close()
def main():
    """Print a template once for every alignment read from stdin.

    ``args[0]`` is passed to ``Template``; ``options.format`` selects the
    reader obtained from ``align.get_reader`` and falls back to ``"maf"``
    when unset.  For each alignment the template's ``a`` attribute is set
    to the alignment and ``c`` to its components before printing.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        template = Template(args[0])
        fmt = options.format or "maf"
    except:
        doc_optparse.exception()

    for alignment in align.get_reader(fmt, sys.stdin):
        template.a = alignment
        template.c = alignment.components
        print(template)
def main():
    """Print a template once for every alignment read from stdin.

    ``args[0]`` is passed to ``Template``; ``options.format`` selects the
    reader obtained from ``align.get_reader`` and falls back to ``"maf"``
    when unset.  For each alignment the template's ``a`` attribute is set
    to the alignment and ``c`` to its components before printing.
    """
    # Parse command line arguments
    options, args = doc_optparse.parse( __doc__ )
    try:
        template = Template( args[0] )
        fmt = options.format or "maf"
    except:
        doc_optparse.exception()

    alignments = align.get_reader( fmt, sys.stdin )
    for alignment in alignments:
        template.a = alignment
        template.c = alignment.components
        print(template)
def main():
    """Compute ``base_coverage`` for a single interval file.

    The column layout comes from ``options.cols1`` and the two positional
    arguments are the input and output file names.  Argument problems go
    to ``doc_optparse.exception()``; a ``ParseError`` from
    ``base_coverage`` is reported through ``fail``.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_idx, start_idx, end_idx, strand_idx = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chrom_idx,
                            start_col=start_idx,
                            end_col=end_idx,
                            strand_col=strand_idx,
                            fix_strand=True )

    try:
        bases = base_coverage( g1 )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
def main():
    """Mask CpG sites in a MAF stream and report the masked percentage.

    Alignments are read from stdin and written to stdout.
    ``options.mask`` supplies the mask character (``"?"`` when unset) and
    ``options.restricted`` selects ``cpg.Restricted`` instead of
    ``cpg.Inclusive``.  After the run the percentage
    ``masked / total * 100`` is written to stderr; when ``total`` is zero
    the percentage is reported as ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        if options.mask:
            mask = options.mask
        else:
            mask = "?"
    except:
        doc_optparse.exception()

    reader = bx.align.maf.Reader( sys.stdin )
    writer = bx.align.maf.Writer( sys.stdout )

    if options.restricted:
        cpgfilter = bx.align.sitemask.cpg.Restricted( mask=mask )
    else:
        cpgfilter = bx.align.sitemask.cpg.Inclusive( mask=mask )
    cpgfilter.run( reader, writer.write )

    # Guard the ratio: a zero total would otherwise abort the script after
    # all output has already been written.
    total = float( cpgfilter.total )
    if total:
        percent = float( cpgfilter.masked ) / total * 100
    else:
        percent = 0.0
    sys.stderr.write( str( percent ) + "% bases masked.\n" )
def main():
    """Compute ``base_coverage`` for a single interval file.

    The column layout comes from ``options.cols1`` and the two positional
    arguments are the input and output file names.  Argument problems go
    to ``doc_optparse.exception()``; a ``ParseError`` from
    ``base_coverage`` is reported through ``fail``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_idx, start_idx, end_idx, strand_idx = parse_cols_arg(options.cols1)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    source = fileinput.FileInput(in_fname)
    g1 = NiceReaderWrapper(source,
                           chrom_col=chrom_idx,
                           start_col=start_idx,
                           end_col=end_idx,
                           strand_col=strand_idx,
                           fix_strand=True)

    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    """Intersect two interval files and write the result.

    ``options.mincols`` (default 1) and ``options.pieces`` are forwarded
    to ``intersect``.  Argument problems go to
    ``doc_optparse.exception()``; a ``ParseError`` while iterating closes
    the output and is reported through ``fail``.
    """
    mincols = 1
    upstream_pad = 0
    downstream_pad = 0

    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg(options.cols1)
        chrom_b, start_b, end_b, strand_b = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        pieces = bool(options.pieces)
        path_a, path_b, out_path = args
    except:
        doc_optparse.exception()

    first = NiceReaderWrapper(fileinput.FileInput(path_a),
                              chrom_col=chrom_a,
                              start_col=start_a,
                              end_col=end_a,
                              strand_col=strand_a,
                              fix_strand=True)
    second = NiceReaderWrapper(fileinput.FileInput(path_b),
                               chrom_col=chrom_b,
                               start_col=start_b,
                               end_col=end_b,
                               strand_col=strand_b,
                               fix_strand=True)

    out_file = open(out_path, "w")
    try:
        for item in intersect([first, second], pieces=pieces, mincols=mincols):
            # Intervals are written as their tab-joined fields; anything
            # else is written unchanged.
            if type(item) == GenomicInterval:
                text = "\t".join(item.fields)
            else:
                text = item
            out_file.write("%s\n" % text)
    except ParseError as e:
        out_file.close()
        fail("Invalid file format: %s" % str(e))
def main():
    """Concatenate two interval files and write the result.

    ``options.sameformat`` is forwarded to ``concat`` as a boolean flag.
    The first reader is created without a strand column; its
    ``strand_col`` is assigned afterwards, and only when the parsed strand
    column index is non-negative.  A ``ParseError`` while iterating closes
    the output and is reported through ``fail``.
    """
    sameformat = False
    upstream_pad = 0
    downstream_pad = 0

    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_a, start_a, end_a, strand_a = parse_cols_arg( options.cols1 )
        chrom_b, start_b, end_b, strand_b = parse_cols_arg( options.cols2 )
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
                            chrom_col=chrom_a,
                            start_col=start_a,
                            end_col=end_a,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
                            chrom_col=chrom_b,
                            start_col=start_b,
                            end_col=end_b,
                            strand_col=strand_b,
                            fix_strand=True )
    # The first reader's strand column is set only when one was supplied.
    if strand_a >= 0:
        g1.strand_col = strand_a

    out_file = open( out_fname, "w" )
    try:
        for item in concat( [g1, g2], sameformat=sameformat ):
            if type( item ) is GenomicInterval:
                text = "\t".join( item.fields )
            else:
                text = item
            out_file.write( "%s\n" % text )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
def main():
    """Stream intervals from stdin through ``do_interval``.

    ``args[0]`` is translated with ``tree_tx`` and split on whitespace to
    form the source list, ``args[1]`` is loaded with ``load_seq_db``, and
    the remaining arguments are handed to ``maf.MultiIndexed``.  Output
    goes through a ``maf.Writer`` on stdout.  Any error raised during this
    setup is passed to ``doc_optparse.exception()``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        species = args[0].translate(tree_tx).split()
        sequences = load_seq_db(args[1])
        maf_index = maf.MultiIndexed(args[2:])
        writer = maf.Writer(sys.stdout)
        fill_missing = bool(options.missingData)
        honor_strand = bool(options.strand)
    except:
        doc_optparse.exception()

    for record in sys.stdin:
        columns = record.split()
        ref_src, start, end = columns[:3]
        # The sixth column is consulted only when strand handling was
        # requested and the column exists; otherwise "+" is used.
        strand = columns[5] if honor_strand and len(columns) > 5 else "+"
        do_interval(species, maf_index, writer, ref_src, int(start),
                    int(end), sequences, fill_missing, strand)

    writer.close()
def main():
    """Compute ``base_coverage`` for one interval file and write it out.

    The column layout comes from ``options.cols1`` and the two positional
    arguments are the input and output file names.  The value returned by
    ``base_coverage`` is written as a single line to the output file.  A
    summary from ``skipped`` is printed when the reader reports skipped
    lines.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_idx, start_idx, end_idx, strand_idx = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chrom_idx,
                            start_col=start_idx,
                            end_col=end_idx,
                            strand_col=strand_idx,
                            fix_strand=True )

    try:
        bases = base_coverage( g1 )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    out_file = open( out_fname, "w" )
    out_file.write( "%s\n" % str( bases ) )
    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
def main():
    """Write the lines of the first input that do not occur in the second.

    The five positional arguments are the two input files, the begin and
    end column selectors (the literal string ``'None'`` means "not
    chosen") and the output file.  The line sets are obtained from
    ``get_lines``; the difference is written to the output file and a
    summary message is printed.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse( __doc__ )
    try:
        inp1_file, inp2_file, begin_col, end_col, out_file = args
    except:
        doc_optparse.exception()

    begin_col = begin_col.strip()
    end_col = end_col.strip()

    if begin_col == 'None' and end_col == 'None':
        begin_col = end_col = ''
    else:
        # The user selected columns for restriction.  Default values are
        # allowed for both begin_col and end_col as long as the user
        # selected at least one of them for restriction.
        if begin_col == 'None':
            begin_col = end_col
        elif end_col == 'None':
            end_col = begin_col
        begin_col = int( begin_col )
        end_col = int( end_col )
        # Make sure that begin_col <= end_col (switch if not)
        if begin_col > end_col:
            begin_col, end_col = end_col, begin_col

    try:
        fo = open( out_file, 'w' )
    except:
        sys.stderr.write( "Unable to open output file\n" )
        sys.exit()

    # len1 is the number of lines in inp1_file
    # lines1 is the set of unique lines in inp1_file
    # diff1 is the number of duplicate lines removed from inp1_file
    len1, lines1 = get_lines( inp1_file, begin_col, end_col, options.ignore_empty_end_cols )
    diff1 = len1 - len( lines1 )
    len2, lines2 = get_lines( inp2_file, begin_col, end_col, options.ignore_empty_end_cols )

    # lines1 becomes the set of unique lines in inp1_file minus the set of
    # unique lines in inp2_file
    lines1.difference_update( lines2 )

    for entry in lines1:
        fo.write( str( entry ) + "\n" )
    fo.close()

    summary = [ 'Subtracted %d lines. ' % ( ( len1 - diff1 ) - len( lines1 ) ) ]
    if begin_col and end_col:
        summary.append( 'Restricted to columns c' + str( begin_col ) + ' thru c' + str( end_col ) + '. ' )
    if diff1 > 0:
        summary.append( 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' % diff1 )
    print("".join( summary ))
def main():
    """Build an interval index file for a wiggle file.

    ``args[0]`` is the wiggle path.  Paths ending in ``.bz2`` or ``.lzo``
    are opened through ``SeekableBzip2File`` / ``SeekableLzopFile`` and
    need a companion table file (the same path with ``t`` appended); the
    four character suffix is then stripped before the default index name
    is derived.  ``args[1]``, when present, overrides the index file name.
    Nothing is done when ``options.version`` is set.  Any error during
    argument handling goes to ``doc_optparse.exception()``.

    Each ``variableStep`` / ``fixedStep`` section is indexed as one
    interval whose file offset is that of its header line; the section
    still open at end of file is indexed as well.

    Raises:
        ValueError: if a line cannot be interpreted in the current mode.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    if options.version:
        return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith(".bz2"):
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File(wiggle_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with lzop_build_offset_table.")
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile(wiggle_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open(wiggle_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except:
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the iterator, as there is no next() and thus
    # no way to access the positions. The following code is
    # modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None
    mode = "bed"

    while 1:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line:
            break

        if line.isspace() or line.startswith("track") or line.startswith("#") \
                or line.startswith("browser"):
            continue
        elif line.startswith("bed"):
            # NOTE(review): the condition is kept as found; splitting the
            # line here avoids using `fields` before it has been assigned.
            fields = line.split()
            indexes.add(fields[0], int(fields[1]), int(fields[2]), pos)
        elif line.startswith("variableStep") or line.startswith("fixedStep"):
            # A new section header closes the previous section, if any.
            if first_pos is not None:
                indexes.add(last_chrom, start, end, first_pos)
            first_pos = pos
            header = bx.wiggle.parse_header(line)
            last_chrom = header['chrom']
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header:
                current_span = int(header['span'])
            else:
                current_span = 1
            if 'step' in header:
                current_step = int(header['step'])
            if line.startswith("variableStep"):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "variableStep":
            fields = line.split()
            end = int(fields[0]) - 1 + current_span
        elif mode == "fixedStep":
            end += current_step
        else:
            raise ValueError("Unexpected input line: %s" % line.strip())

    # The last section has no following header to close it, so add it here.
    if first_pos is not None:
        indexes.add(last_chrom, start, end, first_pos)

    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
def main():
    """Cluster the intervals of one file and write them in the chosen form.

    Positional arguments are the input and output file names. Options set
    the interval columns (``cols1``), the clustering distance (``distance``,
    or ``overlap`` which is stored as a negative distance), the minimum
    number of regions per cluster (``minregions``) and the output mode
    (``output``):

    * 1 -- one merged interval per cluster;
    * 2 -- the original lines that belong to a cluster (or are listed in
      ``extra``), in file order;
    * 3 -- the original lines grouped cluster by cluster;
    * 4 / 5 -- the smallest / largest original interval of each cluster.
    """
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    # Every mode except "merge" refers back to the original lines by number,
    # so read them once up front.
    file_lines = []
    if output in (2, 3, 4, 5):
        with open(in_fname, "r") as f1:
            file_lines = f1.readlines()

    # The with-block guarantees the output is closed even on the early exit
    # taken when an interval cannot be re-parsed.
    with open(out_fname, "w") as out_file:
        if output == 1:
            # "merge": one synthetic line per cluster, "." in unused columns.
            fields = ["."] * (max(g1.chrom_col, g1.start_col, g1.end_col) + 1)
            for chrom, tree in clusters.items():
                for start, end, lines in tree.getregions():
                    fields[g1.chrom_col] = chrom
                    fields[g1.start_col] = str(start)
                    fields[g1.end_col] = str(end)
                    out_file.write("%s\n" % "\t".join(fields))
        elif output == 2:
            # "filtered": preserve order of file and comments, etc.
            clustered = set()
            for tree in clusters.values():
                clustered.update(tree.getlines())
            for linenum, line in enumerate(file_lines):
                if linenum in clustered or linenum in extra:
                    out_file.write("%s\n" % line.rstrip("\n\r"))
        elif output == 3:
            # "clustered": original intervals, grouped by cluster.
            for tree in clusters.values():
                for linenum in tree.getlines():
                    out_file.write("%s\n" % file_lines[linenum].rstrip("\n\r"))
        elif output in (4, 5):
            # "minimum" (4) / "maximum" (5): one original interval per cluster.
            for tree in clusters.values():
                for start, end, lines in tree.getregions():
                    outsize = -1
                    outinterval = None
                    for line in lines:
                        fileline = file_lines[line].rstrip("\n\r")
                        try:
                            cluster_interval = GenomicInterval(
                                g1, fileline.split("\t"), g1.chrom_col,
                                g1.start_col, g1.end_col, g1.strand_col,
                                g1.default_strand, g1.fix_strand)
                        except Exception as exc:
                            print(str(exc), file=sys.stderr)
                            sys.exit()
                        interval_size = cluster_interval.end - cluster_interval.start
                        if outsize == -1 or \
                           (outsize > interval_size and output == 4) or \
                           (outsize < interval_size and output == 5):
                            outinterval = cluster_interval
                            outsize = interval_size
                    out_file.write("%s\n" % outinterval)

    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
def main():
    """Run the ``Simple`` quality filter over an AXT or MAF alignment file.

    Positional arguments are the input and output alignment files. The
    ``list`` option is a ``:``-separated list of
    ``species,lengths_file[,quality_file]`` entries; each lengths file holds
    whitespace-separated ``name length`` pairs. ``input``/``output`` choose
    the formats (``axt`` or ``maf``), ``mask`` and ``quality`` are passed to
    the filter. A short summary of how many base pairs were masked is
    printed at the end.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        inputformat = options.input
        outputformat = options.output
        mask = options.mask
        minqual = int( options.quality )
        qtype = options.type  # parsed for validation; not used below
        speciesAndLens = options.list
        inputfile = args[0]
        outputfile = args[1]
    except:
        doc_optparse.exception()

    outstream = open( outputfile, "w" )
    instream = open( inputfile, "r" )
    try:
        qualfiles = {}

        # read lens
        specieslist = []
        species_to_lengths = {}
        for entry in speciesAndLens.split( ":" ):
            fields = entry.split( "," )
            lendict = dict()
            lenstream = fileinput.FileInput( fields[1] )
            try:
                for line in lenstream:
                    region = line.split()
                    lendict[region[0]] = int( region[1] )
            finally:
                lenstream.close()
            species_to_lengths[fields[0]] = lendict
            if len( fields ) >= 3:
                qualfiles[fields[0]] = fields[2]
            # The species name is the first comma-separated field; this is
            # the same key used for species_to_lengths above.
            specieslist.append( fields[0] )

        # open quality binned_arrays
        reader = None
        writer = None

        if inputformat == "axt":
            # load axt
            if len( specieslist ) != 2:
                print( "AXT is pairwise only." )
                sys.exit()
            reader = bx.align.axt.Reader( instream,
                                          species1=specieslist[0],
                                          species2=specieslist[1],
                                          species_to_lengths=species_to_lengths )
        elif inputformat == "maf":
            # load maf
            reader = bx.align.maf.Reader( instream,
                                          species_to_lengths=species_to_lengths )
        else:
            sys.exit( "Unsupported input format: %s" % inputformat )

        if outputformat == "axt":
            # setup axt
            if len( specieslist ) != 2:
                print( "AXT is pairwise only." )
                sys.exit()
            writer = bx.align.axt.Writer( outstream, attributes=reader.attributes )
        elif outputformat == "maf":
            # setup maf
            writer = bx.align.maf.Writer( outstream, attributes=reader.attributes )
        else:
            sys.exit( "Unsupported output format: %s" % outputformat )

        qualfilter = Simple( mask=mask, qualspecies=species_to_lengths,
                             qualfiles=qualfiles, minqual=minqual, cache=50 )
        qualfilter.run( reader, writer.write )
    finally:
        instream.close()
        outstream.close()

    print( "For "+str(qualfilter.total)+" base pairs, "+str(qualfilter.masked)+" base pairs were masked." )
    # Guard against an empty alignment: nothing processed means nothing masked.
    if qualfilter.total:
        percent_masked = float(qualfilter.masked)/float(qualfilter.total) * 100
    else:
        percent_masked = 0.0
    print( str(percent_masked)+"%" )
def main():
    """Find clusters of intervals in a file and write the requested view.

    Takes the input and output file names as positional arguments. The
    ``output`` option selects what is written: 1 merges each cluster into a
    single interval, 2 keeps the clustered lines (plus the line numbers in
    ``extra``) in their original order, 3 writes the original lines cluster
    by cluster, and 4 or 5 write the smallest or largest interval of every
    cluster. ``distance``/``overlap`` and ``minregions`` are forwarded to
    ``find_clusters``.
    """
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            # An overlap requirement is expressed as a negative distance.
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    def original_lines():
        # Line-number lookups need the untouched input; read it in one go.
        with open( in_fname, "r" ) as f1:
            return f1.readlines()

    def write_merged( out_file ):
        # One synthetic line per cluster, "." in every column not set here.
        fields = ["."] * ( max( g1.chrom_col, g1.start_col, g1.end_col ) + 1 )
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str( start )
                fields[g1.end_col] = str( end )
                out_file.write( "%s\n" % "\t".join( fields ) )

    def write_filtered( out_file ):
        # Preserve order of file and comments, etc.
        keep = set()
        for tree in clusters.values():
            keep.update( tree.getlines() )
        for linenum, line in enumerate( original_lines() ):
            if linenum in keep or linenum in extra:
                out_file.write( "%s\n" % line.rstrip( "\n\r" ) )

    def write_clustered( out_file ):
        # Original intervals, but written near each other (i.e. clustered).
        file_lines = original_lines()
        for tree in clusters.values():
            for linenum in tree.getlines():
                out_file.write( "%s\n" % file_lines[linenum].rstrip( "\n\r" ) )

    def write_extreme( out_file ):
        # Smallest (output 4) or largest (output 5) interval of each cluster.
        file_lines = original_lines()
        for tree in clusters.values():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    fileline = file_lines[line].rstrip( "\n\r" )
                    try:
                        cluster_interval = GenomicInterval( g1, fileline.split( "\t" ),
                                                            g1.chrom_col, g1.start_col,
                                                            g1.end_col, g1.strand_col,
                                                            g1.default_strand, g1.fix_strand )
                    except Exception as exc:
                        print( str( exc ), file=sys.stderr )
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write( "%s\n" % outinterval )

    writers = { 1: write_merged, 2: write_filtered, 3: write_clustered,
                4: write_extreme, 5: write_extreme }

    out_file = open( out_fname, "w" )
    try:
        # Unknown modes write nothing, leaving an empty output file.
        if output in writers:
            writers[output]( out_file )
    finally:
        # Runs on the sys.exit() path too, so the output is never left open.
        out_file.close()

    if g1.skipped > 0:
        print( skipped( g1, filedesc="" ) )
def main():
    """Filter an AXT or MAF alignment through the ``Simple`` quality filter.

    Arguments: input alignment file, output alignment file. Options:
    ``input`` and ``output`` name the formats (``axt`` or ``maf``), ``mask``
    and ``quality`` configure the filter, and ``list`` describes the species
    as ``species,lengths_file[,quality_file]`` entries joined by ``:``.
    Prints how many base pairs were seen and masked, and the percentage.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        inputformat = options.input
        outputformat = options.output
        mask = options.mask
        minqual = int(options.quality)
        qtype = options.type  # read so a missing option is reported; unused
        speciesAndLens = options.list
        inputfile = args[0]
        outputfile = args[1]
    except:
        doc_optparse.exception()

    def load_lengths(path):
        # Each line of a lengths file is "<name> <length>".
        lengths = {}
        stream = fileinput.FileInput(path)
        try:
            for line in stream:
                region = line.split()
                lengths[region[0]] = int(region[1])
        finally:
            stream.close()
        return lengths

    def require_pairwise(species):
        if len(species) != 2:
            print("AXT is pairwise only.")
            sys.exit()

    # read lens
    entries = [entry.split(",") for entry in speciesAndLens.split(":")]
    # Species names are the first comma-separated field of every entry.
    specieslist = [fields[0] for fields in entries]
    species_to_lengths = {}
    qualfiles = {}
    for fields in entries:
        species_to_lengths[fields[0]] = load_lengths(fields[1])
        if len(fields) >= 3:
            qualfiles[fields[0]] = fields[2]

    with open(outputfile, "w") as outstream, open(inputfile, "r") as instream:
        # The reader is selected by the *input* format.
        if inputformat == "axt":
            require_pairwise(specieslist)
            reader = bx.align.axt.Reader(instream,
                                         species1=specieslist[0],
                                         species2=specieslist[1],
                                         species_to_lengths=species_to_lengths)
        elif inputformat == "maf":
            reader = bx.align.maf.Reader(instream,
                                         species_to_lengths=species_to_lengths)
        else:
            sys.exit("Unsupported input format: %s" % inputformat)

        if outputformat == "axt":
            require_pairwise(specieslist)
            writer = bx.align.axt.Writer(outstream, attributes=reader.attributes)
        elif outputformat == "maf":
            writer = bx.align.maf.Writer(outstream, attributes=reader.attributes)
        else:
            sys.exit("Unsupported output format: %s" % outputformat)

        qualfilter = Simple(mask=mask, qualspecies=species_to_lengths,
                            qualfiles=qualfiles, minqual=minqual, cache=50)
        qualfilter.run(reader, writer.write)

    print("For " + str(qualfilter.total) + " base pairs, " + str(
        qualfilter.masked) + " base pairs were masked.")
    # Avoid dividing by zero when the alignment contained no base pairs.
    fraction = 0.0
    if qualfilter.total:
        fraction = float(qualfilter.masked) / float(qualfilter.total)
    print(str(fraction * 100) + "%")
def __main__():
    """Write the genomic DNA for every interval or GFF feature of a file.

    Positional arguments are the input and output file names. Sequence data
    comes either from a FASTA file given with ``fasta`` (converted to 2bit
    with ``faToTwoBit`` into a temporary file) or from the location
    ``check_seq_file`` returns for ``dbkey``; per-chromosome ``.nib`` files
    under that path are preferred, otherwise the path is read as a 2bit
    file. Output is FASTA or the input line with the sequence appended,
    depending on ``output_format``. Lines that cannot be processed are
    skipped and summarised on stdout at the end.
    """
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse(__doc__)
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(options.cols)
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg(options.cols)
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        seq_path = None
        try:
            # mkstemp reserves the name; faToTwoBit then overwrites the file.
            handle, seq_path = tempfile.mkstemp(dir=".")
            os.close(handle)
            # An argument list (no shell) keeps odd characters in the file
            # name from being interpreted; communicate() drains stderr
            # however large it is.
            proc = subprocess.Popen(args=["faToTwoBit", fasta_file, seq_path],
                                    stderr=subprocess.PIPE)
            stderr = proc.communicate()[1]
            # Error checking.
            if proc.returncode != 0:
                if not isinstance(stderr, str):
                    stderr = stderr.decode("utf-8", "replace")
                raise Exception(stderr)
        except Exception as e:
            if seq_path is not None and os.path.exists(seq_path):
                os.remove(seq_path)
            stop_err('Error running faToTwoBit. ' + str(e))
    else:
        seq_path = check_seq_file(dbkey, GALAXY_DATA_INDEX_DIR)
        if not os.path.exists(seq_path):
            # If this occurs, we need to fix the metadata validator.
            stop_err(
                "No sequences are available for '%s', request them by reporting this error."
                % dbkey)

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines(feature):
        if isinstance(feature, gff_util.GFFFeature):
            return feature.lines()
        else:
            return [feature.rstrip('\r\n')]

    warnings = []
    # Mutable holder so the helper below can update the counters.
    invalid = {"count": 0, "first_number": 0, "lines": []}

    def skip(feature, feature_line, warning):
        # Record a warning and count the lines of the skipped feature; the
        # first skipped feature is remembered for the final report.
        warnings.append(warning)
        lines = get_lines(feature)
        if not invalid["lines"]:
            invalid["lines"] = lines
            invalid["first_number"] = feature_line
        invalid["count"] += len(lines)

    twobitfile = None
    input_handle = open(input_filename)
    fout = open(output_filename, "w")
    try:
        file_iterator = input_handle
        if gff_format and interpret_features:
            file_iterator = gff_util.GFFReaderWrapper(file_iterator, fix_strand=False)
        line_count = 1
        for feature in file_iterator:
            # Advance the line counter up front so that skipped, blank and
            # comment lines are counted too.
            feature_line = line_count
            if isinstance(feature, gff_util.GFFFeature):
                line_count += len(feature.intervals)
            else:
                line_count += 1

            # Ignore comments, headers.
            if isinstance(feature, (Header, Comment)):
                continue

            name = ""
            if gff_format and interpret_features:
                # Processing features.
                gff_util.convert_gff_coords_to_bed(feature)
                chrom = feature.chrom
                start = feature.start
                end = feature.end
                strand = feature.strand
            else:
                # Processing lines, either interval or GFF format.
                line = feature.rstrip('\r\n')
                if not line or line.startswith("#"):
                    continue
                fields = line.split('\t')
                try:
                    chrom = fields[chrom_col]
                    start = int(fields[start_col])
                    end = int(fields[end_col])
                    # NOTE(review): a name column with index 0 is falsy and
                    # would be ignored here -- confirm that cannot happen.
                    if name_col:
                        name = fields[name_col]
                    if gff_format:
                        start, end = gff_util.convert_gff_coords_to_bed([start, end])
                    if includes_strand_col:
                        strand = fields[strand_col]
                except Exception:
                    skip(feature, feature_line,
                         "Invalid chrom, start or end column values. ")
                    continue
                if start > end:
                    skip(feature, feature_line,
                         "Invalid interval, start '%d' > end '%d'.  " % (start, end))
                    continue
                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''

            # Open sequence file and get sequence for feature/interval.
            if seq_path and os.path.exists("%s/%s.nib" % (seq_path, chrom)):
                # TODO: improve support for GFF-nib interaction.
                if chrom in nibs:
                    nib = nibs[chrom]
                else:
                    nibs[chrom] = nib = bx.seq.nib.NibFile(
                        open("%s/%s.nib" % (seq_path, chrom), "rb"))
                try:
                    sequence = nib.get(start, end - start)
                except Exception:
                    skip(feature, feature_line,
                         "Unable to fetch the sequence from '%d' to '%d' for build '%s'. "
                         % (start, end, dbkey))
                    continue
            elif seq_path and os.path.isfile(seq_path):
                if not twobitfile:
                    twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path, "rb"))
                try:
                    if gff_format and interpret_features:
                        # Create sequence from intervals within a feature.
                        sequence = ''.join(
                            twobitfile[interval.chrom][interval.start:interval.end]
                            for interval in feature.intervals)
                    else:
                        sequence = twobitfile[chrom][start:end]
                except Exception:
                    skip(feature, feature_line,
                         "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. "
                         % (start, end, chrom))
                    continue
            else:
                skip(feature, feature_line,
                     "Chromosome by name '%s' was not found for build '%s'. "
                     % (chrom, dbkey))
                continue

            if sequence == '':
                skip(feature, feature_line,
                     "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. "
                     % (chrom, start, end, dbkey))
                continue

            if includes_strand_col and strand == "-":
                sequence = reverse_complement(sequence)

            if output_format == "fasta":
                l = len(sequence)
                c = 0
                if gff_format:
                    start, end = gff_util.convert_bed_coords_to_gff([start, end])
                fields = [dbkey, str(chrom), str(start), str(end), strand]
                meta_data = "_".join(fields)
                if name.strip():
                    fout.write(">%s %s\n" % (meta_data, name))
                else:
                    fout.write(">%s\n" % meta_data)
                # Wrap the sequence at 50 characters per line.
                while c < l:
                    b = min(c + 50, l)
                    fout.write("%s\n" % str(sequence[c:b]))
                    c = b
            else:  # output_format == "interval"
                if gff_format and interpret_features:
                    # TODO: need better GFF Reader to capture all information needed
                    # to produce this line.
                    meta_data = "\t".join([
                        feature.chrom, "galaxy_extract_genomic_dna", "interval",
                        str(feature.start), str(feature.end), feature.score,
                        feature.strand, ".",
                        gff_util.gff_attributes_to_str(feature.attributes, "GTF")
                    ])
                else:
                    meta_data = "\t".join(fields)
                if gff_format:
                    format_str = "%s seq \"%s\";\n"
                else:
                    format_str = "%s\t%s\n"
                fout.write(format_str % (meta_data, str(sequence)))
    finally:
        fout.close()
        input_handle.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if invalid["count"]:
        # Error message includes up to the first 10 skipped lines.
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % (
            invalid["count"], invalid["first_number"],
            '\n'.join(invalid["lines"][:10])))

    # Clean up temp file.
    if fasta_file:
        os.remove(seq_path)
def main():
    """Write, for each interval of the first file, its closest feature.

    Positional arguments: first input, second input, output file and the
    search ``direction`` handed to ``proximal_region_finder``. ``cols1`` /
    ``cols2`` give the interval columns of the two inputs and ``gff1`` /
    ``gff2`` flag an input as GFF. When the first input is GFF the closest
    feature is appended as a ``closest_feature`` attribute; otherwise its
    fields are appended to the line.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)

    # Find flanking features.
    out_file = open(out_fname, "w")
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if isinstance(result, list):
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'

                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff(line)
                    closest_feature = convert_bed_coords_to_gff(closest_feature)

                    # Backslash-escape double quotes in the closest feature
                    # so it can sit inside the quoted attribute value.
                    out_file.write("%s closest_feature \"%s\" \n" % (
                        "\t".join(line.fields),
                        "\t".join(closest_feature.fields).replace("\"", "\\\"")))
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend(line.fields)
                    output_line_fields.extend(closest_feature.fields)
                    out_file.write("%s\n" % ("\t".join(output_line_fields)))
            else:
                # Anything that is not a [line, feature] pair is written as is.
                out_file.write("%s\n" % result)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    finally:
        # Make sure buffered output reaches disk whatever happened above.
        out_file.close()
def __main__():
    """Write the genomic DNA of each interval, extended by optional flanks.

    Positional arguments are the input and output file names. ``seq_path``
    is either a directory of per-chromosome ``.nib`` files or a single 2bit
    file. Each interval is widened by ``left_flank`` bases before its start
    and ``right_flank`` bases after its end; in the output the flanks are
    lower case and the interval itself upper case. Output is FASTA or the
    input line with the sequence appended, depending on ``output_format``.
    Lines that cannot be processed are skipped and summarised on stdout.
    """
    lflank = 0
    rflank = 0
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_col, start_col, end_col, strand_col = parse_cols_arg(options.cols)
        output_format = options.output_format
        seq_path = options.seq_path
        if options.left_flank:
            lflank = int(options.left_flank)
        if options.right_flank:
            rflank = int(options.right_flank)
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    if not os.path.exists(seq_path):
        # If this occurs, we need to fix the metadata validator.
        print("No sequences are available for '%s', request them by reporting this error." % seq_path)

    warnings = []
    # Mutable holder so the helper below can update the counters.
    invalid = {"count": 0, "first_number": 0, "first_line": ''}

    def skip(line_number, line, warning):
        # Record a warning and remember the first line that was skipped.
        warnings.append(warning)
        invalid["count"] += 1
        if not invalid["first_line"]:
            invalid["first_number"] = line_number
            invalid["first_line"] = line

    twobitfile = None
    # The sequence path doubles as the build name in messages and headers.
    dbkey = seq_path

    with open(output_filename, "w") as fout, open(input_filename) as fin:
        for i, line in enumerate(fin):
            line = line.rstrip('\r\n')
            if not line or line.startswith("#"):
                continue
            fields = line.split('\t')
            try:
                chrom = fields[chrom_col]
                ostart = int(fields[start_col])
                oend = int(fields[end_col])
                start = ostart - lflank
                end = oend + rflank
                if includes_strand_col:
                    strand = fields[strand_col]
            except Exception:
                skip(i + 1, line, "Invalid chrom, start or end column values. ")
                continue
            if start > end:
                skip(i + 1, line,
                     "Invalid interval, start '%d' > end '%d'.  " % (start, end))
                continue
            if strand not in ['+', '-']:
                strand = '+'
            sequence = ''

            if seq_path and os.path.exists("%s/%s.nib" % (seq_path, chrom)):
                if chrom in nibs:
                    nib = nibs[chrom]
                else:
                    nibs[chrom] = nib = bx.seq.nib.NibFile(
                        open("%s/%s.nib" % (seq_path, chrom), "rb"))
                try:
                    sequence = nib.get(start, end - start)
                except Exception:
                    skip(i + 1, line,
                         "Unable to fetch the sequence from '%d' to '%d' for build '%s'. "
                         % (start, end, dbkey))
                    continue
            elif seq_path and os.path.isfile(seq_path):
                if not twobitfile:
                    twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path, "rb"))
                try:
                    sequence = twobitfile[chrom][start:end]
                except Exception:
                    skip(i + 1, line,
                         "Unable to fetch the sequence from '%d' to '%d' for build '%s'. "
                         % (start, end, dbkey))
                    continue
            else:
                skip(i + 1, line,
                     "Chromosome by name '%s' was not found for build '%s'. "
                     % (chrom, dbkey))
                continue

            if sequence == '':
                skip(i + 1, line,
                     "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. "
                     % (chrom, start, end, dbkey))
                continue

            if includes_strand_col and strand == "-":
                sequence = reverse_complement(sequence)

            # Lower-case the flanks, upper-case the interval itself. The
            # right flank is the last `rflank` bases; max() keeps the three
            # slices from overlapping when the fetched sequence is short.
            # NOTE(review): on the minus strand the sequence has been
            # reversed, so the first `lflank` bases are the genomic right
            # flank -- confirm which flank sizes should apply there.
            core_end = max(lflank, len(sequence) - rflank)
            sequence = (sequence[:lflank].lower()
                        + sequence[lflank:core_end].upper()
                        + sequence[core_end:].lower())

            if output_format == "fasta":
                l = len(sequence)
                c = 0
                fields = [dbkey, str(chrom), str(ostart), str(oend), strand]
                meta_data = "_".join(fields)
                fout.write(">%s\n" % meta_data)
                # Wrap the sequence at 50 characters per line.
                while c < l:
                    b = min(c + 50, l)
                    fout.write("%s\n" % str(sequence[c:b]))
                    c = b
            else:  # output_format == "interval"
                meta_data = "\t".join(fields)
                fout.write("%s\t%s\n" % (meta_data, str(sequence)))

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if invalid["count"]:
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % (
            invalid["count"], invalid["first_number"], invalid["first_line"]))
for position in posgen: kwargs['position'] = position self.out_stream.write(self.template % kwargs) def close(self): self.out_stream.flush() self.out_stream.close() if __name__ == "__main__": options, args = doc_optparse.parse( __doc__ ) try: chr_col_1, start_col_1, end_col_1, strand_col_1 = [int(x) - 1 for x in options.cols1.split(',')] chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [int(x) - 1 for x in options.cols2.split(',')] in_fname, out_fname = args except: doc_optparse.exception() # Sort through a tempfile first temp_file = tempfile.NamedTemporaryFile(mode="r") environ['LC_ALL'] = 'POSIX' commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % (chr_col_1 + 1, start_col_1 + 1, end_col_1 + 1, temp_file.name, in_fname) errorcode, stdout = commands.getstatusoutput(commandline) coverage = CoverageWriter( out_stream=open(out_fname, "a"), chromCol=chr_col_2, positionCol=position_col_2, forwardCol=forward_col_2, reverseCol=reverse_col_2, ) temp_file.seek(0) interval = io.NiceReaderWrapper( temp_file, chrom_col=chr_col_1, start_col=start_col_1, end_col=end_col_1,
def main():
    """Write the lines of the first file that do not occur in the second.

    Positional arguments: first input, second input, begin column, end
    column and output file. A column given as the string ``None`` means
    "not selected"; if only one of the two is selected it is used for both,
    and if neither is selected whole lines are compared. The comparison
    itself is done on the sets returned by ``get_lines``. A one-line summary
    is printed to stdout.
    """
    # Parsing Command Line here
    options, args = doc_optparse.parse(__doc__)
    try:
        inp1_file, inp2_file, begin_col, end_col, out_file = args
    except:
        doc_optparse.exception()

    begin_col = begin_col.strip()
    end_col = end_col.strip()

    if begin_col != 'None' or end_col != 'None':
        # The user selected columns for restriction. We'll allow default
        # values for both begin_col and end_col as long as the user selected
        # at least one of them for restriction.
        if begin_col == 'None':
            begin_col = end_col
        elif end_col == 'None':
            end_col = begin_col
        begin_col = int(begin_col)
        end_col = int(end_col)
        # Make sure that begin_col <= end_col (switch if not)
        if begin_col > end_col:
            begin_col, end_col = end_col, begin_col
    else:
        begin_col = end_col = ''

    try:
        fo = open(out_file, 'w')
    except (IOError, OSError):
        sys.stderr.write("Unable to open output file\n")
        sys.exit()

    try:
        # len1 is the number of lines in inp1_file
        # lines1 is the set of unique lines in inp1_file
        # diff1 is the number of duplicate lines removed from inp1_file
        len1, lines1 = get_lines(inp1_file, begin_col, end_col, options.ignore_empty_end_cols)
        diff1 = len1 - len(lines1)
        len2, lines2 = get_lines(inp2_file, begin_col, end_col, options.ignore_empty_end_cols)

        lines1.difference_update(lines2)
        # lines1 is now the set of unique lines in inp1_file - the set of
        # unique lines in inp2_file

        for line in lines1:
            fo.write("%s\n" % line)
    finally:
        fo.close()

    info_msg = 'Subtracted %d lines. ' % ((len1 - diff1) - len(lines1))

    if begin_col and end_col:
        info_msg += 'Restricted to columns c' + str(begin_col) + ' thru c' + str(end_col) + '. '

    if diff1 > 0:
        info_msg += 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' % diff1

    print(info_msg)
def __main__():
    """Write the genomic DNA for every interval of a tab-separated file.

    Positional arguments are the input and output file names. The sequence
    location is looked up with ``check_seq_file`` from ``dbkey`` and
    ``GALAXY_DATA_INDEX_DIR``; per-chromosome ``.nib`` files under that
    path are preferred, otherwise the path is read as a single 2bit file.
    Minus-strand intervals are reverse-complemented when a strand column is
    configured. Output is FASTA or the input line with the sequence
    appended, depending on ``output_format``. Lines that cannot be
    processed are skipped and summarised on stdout.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
        dbkey = options.dbkey
        output_format = options.output_format
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}
    seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
    if not os.path.exists( seq_path ):
        # If this occurs, we need to fix the metadata validator.
        stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )

    warnings = []
    # Mutable holder so the helper below can update the counters.
    invalid = { "count": 0, "first_number": 0, "first_line": '' }

    def skip( line_number, line, warning ):
        # Record a warning and remember the first line that was skipped.
        warnings.append( warning )
        invalid["count"] += 1
        if not invalid["first_line"]:
            invalid["first_number"] = line_number
            invalid["first_line"] = line

    twobitfile = None
    with open( output_filename, "w" ) as fout, open( input_filename ) as fin:
        for i, line in enumerate( fin ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( "#" ):
                continue
            fields = line.split( '\t' )
            try:
                chrom = fields[chrom_col]
                start = int( fields[start_col] )
                end = int( fields[end_col] )
                if includes_strand_col:
                    strand = fields[strand_col]
            except Exception:
                skip( i + 1, line, "Invalid chrom, start or end column values. " )
                continue
            if start > end:
                skip( i + 1, line, "Invalid interval, start '%d' > end '%d'.  " % ( start, end ) )
                continue
            if strand not in ['+', '-']:
                strand = '+'
            sequence = ''

            nib_path = "%s/%s.nib" % ( seq_path, chrom )
            if seq_path and os.path.exists( nib_path ):
                if chrom in nibs:
                    nib = nibs[chrom]
                else:
                    nibs[chrom] = nib = bx.seq.nib.NibFile( open( nib_path, "rb" ) )
                try:
                    sequence = nib.get( start, end-start )
                except Exception:
                    skip( i + 1, line,
                          "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end, dbkey ) )
                    continue
            elif seq_path and os.path.isfile( seq_path ):
                if not twobitfile:
                    twobitfile = bx.seq.twobit.TwoBitFile( open( seq_path, "rb" ) )
                try:
                    sequence = twobitfile[chrom][start:end]
                except Exception:
                    skip( i + 1, line,
                          "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end, dbkey ) )
                    continue
            else:
                skip( i + 1, line,
                      "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey ) )
                continue

            if sequence == '':
                skip( i + 1, line,
                      "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " %( chrom, start, end, dbkey ) )
                continue

            if includes_strand_col and strand == "-":
                sequence = reverse_complement( sequence )

            if output_format == "fasta":
                l = len( sequence )
                c = 0
                fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
                meta_data = "_".join( fields )
                fout.write( ">%s\n" % meta_data )
                # Wrap the sequence at 50 characters per line.
                while c < l:
                    b = min( c + 50, l )
                    fout.write( "%s\n" % str( sequence[c:b] ) )
                    c = b
            else: # output_format == "interval"
                meta_data = "\t".join( fields )
                fout.write( "%s\t%s\n" % ( meta_data, str( sequence ) ) )

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len( warnings )
        warn_msg += warnings[0]
        print( warn_msg )
    if invalid["count"]:
        print( 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( invalid["count"], invalid["first_number"], invalid["first_line"] ) )
def main():
    """Write, for each interval of the first file, its closest feature.

    Positional arguments: first input, second input, output file and the
    search ``direction`` handed to ``proximal_region_finder``. ``cols1`` /
    ``cols2`` give the interval columns of the two inputs and ``gff1`` /
    ``gff2`` flag an input as GFF. When the first input is GFF the closest
    feature is appended as a ``closest_feature`` attribute; otherwise its
    fields are appended to the line. The direction and the number of
    skipped lines per input are reported on stdout.
    """
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    in1_reader_wrapper = GFFIntervalToBEDReaderWrapper if in1_gff_format else NiceReaderWrapper
    in2_reader_wrapper = GFFIntervalToBEDReaderWrapper if in2_gff_format else NiceReaderWrapper

    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )

    # Find flanking features. The with-block closes (and flushes) the output
    # even when parsing fails part-way through.
    with open( out_fname, "w" ) as out_file:
        try:
            for result in proximal_region_finder( [g1, g2], direction ):
                if not isinstance( result, list ):
                    # Anything that is not a [line, feature] pair is written as is.
                    out_file.write( "%s\n" % result )
                    continue
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'

                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff( line )
                    closest_feature = convert_bed_coords_to_gff( closest_feature )

                    # Backslash-escape double quotes in the closest feature
                    # so it can sit inside the quoted attribute value.
                    escaped_feature = "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
                    out_file.write( "%s closest_feature \"%s\" \n" %
                                    ( "\t".join( line.fields ), escaped_feature ) )
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = list( line.fields ) + list( closest_feature.fields )
                    out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
        except ParseError as exc:
            fail( "Invalid file format: %s" % str( exc ) )

    print( "Direction: %s" % ( direction ) )
    if g1.skipped > 0:
        print( skipped( g1, filedesc=" of 1st dataset" ) )
    if g2.skipped > 0:
        print( skipped( g2, filedesc=" of 2nd dataset" ) )
def __main__():
    """Write the genomic sequence of every interval/feature in the input file.

    Options and the two positional arguments (input and output file names)
    come from ``doc_optparse.parse( __doc__ )``.  Sequence data is taken
    either from a 2bit file built on the fly from ``options.fasta`` (via the
    external ``faToTwoBit`` program) or from the path returned by
    ``check_seq_file`` for ``options.dbkey``; that path may be a directory of
    ``<chrom>.nib`` files or a single 2bit file.

    Each input record is either a tab-separated line (interval or GFF) or,
    when both ``options.gff`` and ``options.interpret_features`` are set, a
    feature produced by ``gff_util.GFFReaderWrapper``.  Output is FASTA when
    ``options.output_format == "fasta"`` and the interval/GFF line with the
    sequence appended otherwise.  Records that cannot be processed are
    skipped; a summary of warnings and skipped lines is printed at the end.
    """
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse( __doc__ )
    try:
        if len( options.cols.split( ',' ) ) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except Exception:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    # Cache of opened nib files keyed by chromosome name; they stay open for
    # the lifetime of the run so repeated chromosomes are not reopened.
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
            with open( tmp_name, 'wb' ) as tmp_stderr:
                # Argument list instead of a shell string: file names with
                # spaces or shell metacharacters are passed through verbatim.
                proc = subprocess.Popen( args=[ "faToTwoBit", fasta_file, seq_path ],
                                         stderr=tmp_stderr.fileno() )
                returncode = proc.wait()

            # Get stderr, allowing for case where it's very large.  Stop on
            # the first empty chunk so a file whose size is an exact multiple
            # of the buffer size cannot loop forever.
            buffsize = 1048576
            chunks = []
            with open( tmp_name, 'rb' ) as tmp_stderr:
                try:
                    while True:
                        chunk = tmp_stderr.read( buffsize )
                        if not chunk:
                            break
                        chunks.append( chunk )
                except OverflowError:
                    pass
            stderr = b''.join( chunks )
            if not isinstance( stderr, str ):
                # Python 3: the file was read as bytes.
                stderr = stderr.decode( 'utf-8', 'replace' )

            # Error checking.
            if returncode != 0:
                raise Exception( stderr )
        except Exception as e:
            stop_err( 'Error running faToTwoBit. ' + str( e ) )
    else:
        seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
        if not os.path.exists( seq_path ):
            # If this occurs, we need to fix the metadata validator.
            stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines( feature ):
        if isinstance( feature, gff_util.GFFFeature ):
            return feature.lines()
        else:
            return [ feature.rstrip( '\r\n' ) ]

    warnings = []
    # Bookkeeping for skipped records; a dict so the helper below can update
    # it without needing ``nonlocal``.
    skipped = { 'count': 0, 'first_line': 0, 'lines': [] }

    # Record one skipped feature: remember the warning, count the feature's
    # own lines, and keep the first skipped feature for the final report.
    def skip_feature( feature, feature_line, warning ):
        warnings.append( warning )
        lines = get_lines( feature )
        if not skipped['lines']:
            skipped['lines'] = lines
            skipped['first_line'] = feature_line
        skipped['count'] += len( lines )

    twobitfile = None
    line_count = 1
    with open( output_filename, "w" ) as fout, open( input_filename ) as input_file:
        file_iterator = input_file
        if gff_format and interpret_features:
            file_iterator = gff_util.GFFReaderWrapper( input_file, fix_strand=False )
        for feature in file_iterator:
            # Line number of this record; line_count is advanced up front so
            # it stays correct on every skip path below.
            feature_line = line_count

            # Ignore comments, headers.
            if isinstance( feature, ( Header, Comment ) ):
                line_count += 1
                continue
            if isinstance( feature, gff_util.GFFFeature ):
                line_count += len( feature.intervals )
            else:
                line_count += 1

            name = ""
            if gff_format and interpret_features:
                # Processing features.
                gff_util.convert_gff_coords_to_bed( feature )
                chrom = feature.chrom
                start = feature.start
                end = feature.end
                strand = feature.strand
            else:
                # Processing lines, either interval or GFF format.
                line = feature.rstrip( '\r\n' )
                if not line or line.startswith( "#" ):
                    continue
                fields = line.split( '\t' )
                try:
                    chrom = fields[chrom_col]
                    start = int( fields[start_col] )
                    end = int( fields[end_col] )
                    if name_col:
                        name = fields[name_col]
                    if gff_format:
                        start, end = gff_util.convert_gff_coords_to_bed( [start, end] )
                    if includes_strand_col:
                        strand = fields[strand_col]
                except Exception:
                    skip_feature( feature, feature_line, "Invalid chrom, start or end column values. " )
                    continue
                if start > end:
                    skip_feature( feature, feature_line,
                                  "Invalid interval, start '%d' > end '%d'. " % ( start, end ) )
                    continue
                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''

            # Open sequence file and get sequence for feature/interval.
            nib_path = "%s/%s.nib" % ( seq_path, chrom )
            if seq_path and os.path.exists( nib_path ):
                # TODO: improve support for GFF-nib interaction.
                if chrom in nibs:
                    nib = nibs[chrom]
                else:
                    # nib is a binary format, so open in binary mode.
                    nibs[chrom] = nib = bx.seq.nib.NibFile( open( nib_path, 'rb' ) )
                try:
                    sequence = nib.get( start, end - start )
                except Exception:
                    skip_feature( feature, feature_line,
                                  "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % ( start, end, dbkey ) )
                    continue
            elif seq_path and os.path.isfile( seq_path ):
                if not twobitfile:
                    # 2bit is a binary format, so open in binary mode.
                    twobitfile = bx.seq.twobit.TwoBitFile( open( seq_path, 'rb' ) )
                try:
                    if gff_format and interpret_features:
                        # Create sequence from intervals within a feature.
                        sequence = ''.join( twobitfile[interval.chrom][interval.start:interval.end]
                                            for interval in feature.intervals )
                    else:
                        sequence = twobitfile[chrom][start:end]
                except Exception:
                    skip_feature( feature, feature_line,
                                  "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % ( start, end, chrom ) )
                    continue
            else:
                skip_feature( feature, feature_line,
                              "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey ) )
                continue

            if sequence == '':
                skip_feature( feature, feature_line,
                              "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " %
                              ( chrom, start, end, dbkey ) )
                continue
            if includes_strand_col and strand == "-":
                sequence = reverse_complement( sequence )

            if output_format == "fasta":
                if gff_format:
                    start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
                meta_data = "_".join( [ dbkey, str( chrom ), str( start ), str( end ), strand ] )
                if name.strip():
                    fout.write( ">%s %s\n" % ( meta_data, name ) )
                else:
                    fout.write( ">%s\n" % meta_data )
                # Wrap the sequence at 50 characters per line.
                for offset in range( 0, len( sequence ), 50 ):
                    fout.write( "%s\n" % str( sequence[offset:offset + 50] ) )
            else:  # output_format == "interval"
                if gff_format and interpret_features:
                    # TODO: need better GFF Reader to capture all information needed
                    # to produce this line.
                    meta_data = "\t".join(
                        [ feature.chrom, "galaxy_extract_genomic_dna", "interval",
                          str( feature.start ), str( feature.end ), feature.score, feature.strand,
                          ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
                else:
                    meta_data = "\t".join( fields )
                if gff_format:
                    format_str = "%s seq \"%s\";\n"
                else:
                    format_str = "%s\t%s\n"
                fout.write( format_str % ( meta_data, str( sequence ) ) )

    if warnings:
        print( "%d warnings, 1st is: %s" % ( len( warnings ), warnings[0] ) )
    if skipped['count']:
        # Error message includes up to the first 10 skipped lines.
        print( 'Skipped %d invalid lines, 1st is #%d, "%s"' %
               ( skipped['count'], skipped['first_line'], '\n'.join( skipped['lines'][:10] ) ) )

    # Clean up temp file.
    if fasta_file:
        os.remove( seq_path )
        os.remove( tmp_name )
self.out_stream.flush() self.out_stream.close() if __name__ == "__main__": options, args = doc_optparse.parse(__doc__) try: chr_col_1, start_col_1, end_col_1, strand_col_1 = [ int(x) - 1 for x in options.cols1.split(',') ] chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [ int(x) - 1 for x in options.cols2.split(',') ] in_fname, out_fname = args except: doc_optparse.exception() # Sort through a tempfile first temp_file = tempfile.NamedTemporaryFile(mode="r") environ['LC_ALL'] = 'POSIX' commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % ( chr_col_1 + 1, start_col_1 + 1, end_col_1 + 1, temp_file.name, in_fname) errorcode, stdout = commands.getstatusoutput(commandline) coverage = CoverageWriter( out_stream=open(out_fname, "a"), chromCol=chr_col_2, positionCol=position_col_2, forwardCol=forward_col_2, reverseCol=reverse_col_2,
def main():
    """Build an interval index for a wiggle file and write it to disk.

    ``args[0]`` is the wiggle file; ``.bz2`` and ``.lzo`` files are opened
    through their seekable wrappers and require a companion table file named
    ``<wiggle_file>t``.  ``args[1]``, when given, names the index file;
    otherwise it is the wiggle file name (minus a ``.bz2``/``.lzo`` suffix)
    plus ``.index``.

    Each BED-style data line is indexed on its own.  Each ``variableStep`` /
    ``fixedStep`` block is indexed as one interval whose position is the
    stream position (``wiggle_in.tell()``) of the block's header line.
    """
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )
    if options.version:
        return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith( ".bz2" ):
            table_file = wiggle_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index bz2 compressed files first "
                                   "create a bz2t file with bzip-table." )
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File( wiggle_file, table_file )
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith( ".lzo" ):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index lzo compressed files first "
                                   "create a lzot file with bzip-table." )
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile( wiggle_file, table_file )
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open( wiggle_file )
        # Determine the name of the index file
        if len( args ) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except Exception:
        # ``except Exception`` (not a bare except) so that an exit requested
        # by doc_optparse.exit above is not intercepted here.
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the iterator, as there is no next() and thus
    # no way to access the positions. The following code is
    # modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None
    mode = "bed"

    while True:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line:
            break

        if line.isspace() or line.startswith( ( "track", "#", "browser" ) ):
            continue
        elif line.startswith( ( "variableStep", "fixedStep" ) ):
            # A new block starts: flush the previous one, if any.
            if first_pos is not None:
                indexes.add( last_chrom, start, end, first_pos )
            first_pos = pos
            header = bx.wiggle.parse_header( line )
            last_chrom = header['chrom']
            start = int( header['start'] ) - 1
            end = start
            current_step = None
            if 'span' in header:
                current_span = int( header['span'] )
            else:
                current_span = 1
            if 'step' in header:
                current_step = int( header['step'] )
            if line.startswith( "variableStep" ):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "bed":
            # BED-style data line: chrom, start, end, ...
            fields = line.split()
            if len( fields ) < 3:
                raise ValueError( "Unexpected input line: %s" % line.strip() )
            indexes.add( fields[0], int( fields[1] ), int( fields[2] ), pos )
        elif mode == "variableStep":
            fields = line.split()
            end = int( fields[0] ) - 1 + current_span
        else:
            # mode == "fixedStep"; the header is expected to carry a step.
            end += current_step

    # The final step block has no following header to trigger its flush.
    if first_pos is not None:
        indexes.add( last_chrom, start, end, first_pos )

    # NOTE(review): binary mode assumes Indexes.write emits packed bytes --
    # confirm against bx.interval_index_file.
    with open( index_file, 'wb' ) as out:
        indexes.write( out )
def main():
    """Write the complement of the intervals in the input file.

    Options come from ``doc_optparse.parse( __doc__ )``: ``cols1`` gives the
    chrom/start/end/strand columns, ``lengths`` names a tab-separated file of
    chromosome names and lengths, and ``all`` requests output for every
    chromosome listed in that file.  The two positional arguments are the
    input and output file names.

    With ``all`` set and a readable lengths file, the result is computed by
    subtracting the input intervals from one whole-chromosome interval per
    listed chromosome; otherwise ``complement`` is called with whatever
    lengths could be read.  A ``ParseError`` from the reader is reported
    through ``fail``.
    """
    allchroms = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    lens = dict()
    chroms = list()
    # The lengths file is used to determine the length of each chromosome.
    # Lengths go into the lens dict for the complement operation, or into
    # whole-chromosome interval lines when all chromosomes were requested.
    # Guard on ``lengths``: FileInput with no file name falls back to
    # sys.argv[1:] / stdin, which would parse unrelated data as lengths.
    if lengths:
        dbfile = fileinput.FileInput( lengths )
        try:
            for line in dbfile:
                fields = line.split( "\t" )
                length = int( fields[1] )
                if allchroms:
                    chroms.append( "\t".join( [ fields[0], "0", str( length ) ] ) )
                else:
                    lens[fields[0]] = length
        except ( IOError, OSError, ValueError, IndexError ):
            # assume LEN doesn't exist or is corrupt somehow; keep whatever
            # was read before the failure
            pass
        finally:
            dbfile.close()

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len( chroms ) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader( chroms )
        generator = subtract( [ chromReader, g1 ] )
    else:
        generator = complement( g1, lens )

    # The with-block closes the output on the success path as well.
    with open( out_fname, "w" ) as out_file:
        try:
            for interval in generator:
                if type( interval ) is GenomicInterval:
                    out_file.write( "%s\n" % "\t".join( interval ) )
                else:
                    out_file.write( "%s\n" % interval )
        except ParseError as exc:
            # Close before reporting so buffered output is flushed even if
            # fail() ends the process.
            out_file.close()
            fail( "Invalid file format: %s" % str( exc ) )
def main():
    """Join two interval files and write the joined rows to the output file.

    Options come from ``doc_optparse.parse(__doc__)``: ``cols1``/``cols2``
    give the chrom/start/end/strand columns of each input, ``mincols`` (if
    set) is passed to ``join`` as an int, and ``fill`` selects left, right or
    both-sided filling.  The three positional arguments are the two input
    file names and the output file name.  Reader parse errors and memory
    exhaustion are reported through ``fail``; afterwards the skipped-line
    summary of each reader is printed when it skipped anything.
    """
    min_overlap = 1
    fill_left = False
    fill_right = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols:
            min_overlap = int(options.mincols)
        fill_mode = options.fill
        if fill_mode:
            # "both" turns on both sides; otherwise only the named side.
            fill_right = fill_mode in ("both", "right")
            fill_left = fill_mode in ("both", "left")
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    # Wrap one input file in a reader configured with its column layout.
    def make_reader(path, chrom, start, end, strand):
        return NiceReaderWrapper(fileinput.FileInput(path),
                                 chrom_col=chrom,
                                 start_col=start,
                                 end_col=end,
                                 strand_col=strand,
                                 fix_strand=True)

    g1 = make_reader(in_fname, chr_col_1, start_col_1, end_col_1, strand_col_1)
    g2 = make_reader(in2_fname, chr_col_2, start_col_2, end_col_2, strand_col_2)

    out_file = open(out_fname, "w")
    try:
        joined_rows = join(g1, g2,
                           mincols=min_overlap,
                           rightfill=fill_right,
                           leftfill=fill_left)
        for outfields in joined_rows:
            # Lists are joined rows; anything else is written as-is.
            if type(outfields) is list:
                text = "\t".join(outfields)
                out_file.write("%s\n" % text)
            else:
                out_file.write("%s\n" % outfields)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    except MemoryError:
        out_file.close()
        fail("Input datasets were too large to complete the join operation.")
    out_file.close()

    for reader, label in ((g1, " of 1st dataset"), (g2, " of 2nd dataset")):
        if reader.skipped > 0:
            print(skipped(reader, filedesc=label))