def main():
    """Cluster intervals from a single interval file.

    Parses column/option settings, wraps the input in a strand-fixing reader,
    and builds the cluster tree. This chunk ends right after clustering
    (the output-writing stage is not part of this chunk).
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # A required overlap is expressed as a negative clustering distance.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        # doc_optparse.exception() reports the usage message and exits.
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    """Compute coverage of intervals in dataset 2 over intervals in dataset 1.

    Reads two interval files, runs the ``coverage`` operation, and writes one
    output line per result interval. This chunk ends after the ParseError
    handler.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in coverage([g1, g2]):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                # Comments / unparsed lines pass through untouched.
                out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Intersect two interval datasets (BED and/or GFF formats).

    GFF inputs are read through GFFReaderWrapper and converted to BED
    coordinates for the intersect operation, then converted back on output.
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        pieces = bool(options.pieces)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper
    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    if in1_gff_format:
        # Intersect requires coordinates in BED format.
        g1.convert_to_bed_coord = True
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)
    if in2_gff_format:
        # Intersect requires coordinates in BED format.
        g2.convert_to_bed_coord = True
    out_file = open(out_fname, "w")
    try:
        for feature in intersect([g1, g2], pieces=pieces, mincols=mincols):
            if isinstance(feature, GFFFeature):
                # Convert back to GFF coordinates since reader converted automatically.
                convert_bed_coords_to_gff(feature)
                for interval in feature.intervals:
                    out_file.write("%s\n" % "\t".join(interval.fields))
            elif isinstance(feature, GenomicInterval):
                out_file.write("%s\n" % "\t".join(feature.fields))
            else:
                out_file.write("%s\n" % feature)
    except ParseError as e:
        out_file.close()
        fail("Invalid file format: %s" % str(e))
def main():
    """Cluster intervals from a single interval file (compact-style variant).

    Parses options, wraps the input reader, and builds the cluster tree.
    This chunk ends after clustering; output writing is not included here.
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # A required overlap is a negative clustering distance.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    """Compute coverage of the 2nd interval dataset over the 1st.

    Reads both files with strand-fixing readers, streams the ``coverage``
    result to the output file, and reports a parse failure via ``fail``.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in coverage([g1, g2]):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                # Non-interval lines (comments, headers) pass through as-is.
                out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def subtract_files(cls, fn1, fn2, out_fn):
    """Subtract intervals in ``fn2`` from those in ``fn1``.

    Writes each remaining piece (pieces=True, 1-column minimum overlap)
    to ``out_fn``, one feature per line. Fails with a message on a
    malformed input line.
    """
    g1 = NiceReaderWrapper(fileinput.FileInput(fn1), fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(fn2), fix_strand=True)
    out_file = open(out_fn, "w")
    try:
        for feature in subtract([g1, g2], pieces=True, mincols=1):
            out_file.write("%s\n" % feature)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Join two interval datasets on overlapping intervals.

    Supports left/right/both fill for unmatched rows. This chunk ends after
    the ParseError handler (the MemoryError handler and summary output are
    not part of this chunk).
    """
    mincols = 1
    leftfill = False
    rightfill = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
            if type(outfields) is list:
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Merge overlapping intervals within a single interval file.

    With ``--threecol`` only chrom/start/end are emitted; otherwise the full
    field list is written. Prints a skipped-line summary at the end.
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type(line) is GenomicInterval:
                    out_file.write("%s\t%s\t%s\n" % (line.chrom, str(line.startCol), str(line.endCol)))
                elif type(line) is list:
                    # Merged result came back as a raw field list; index by the
                    # configured column positions.
                    out_file.write("%s\t%s\t%s\n" % (line[chr_col_1], str(line[start_col_1]), str(line[end_col_1])))
                else:
                    out_file.write("%s\n" % line)
            else:
                if type(line) is GenomicInterval:
                    out_file.write("%s\n" % "\t".join(line.fields))
                elif type(line) is list:
                    out_file.write("%s\n" % "\t".join(line))
                else:
                    out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
def main():
    """Concatenate two interval datasets into one output file.

    When ``--sameformat`` is set, the concat operation assumes both inputs
    share an identical column layout. Prints skipped-line summaries for both
    inputs at the end.
    """
    sameformat = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_file_1),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in_file_2),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in concat([g1, g2], sameformat=sameformat):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))
def main():
    """Join two interval datasets on overlapping intervals (spaced-style variant).

    Parses fill options (left/right/both), wraps both inputs, and streams
    join results. This chunk ends after the ParseError handler.
    """
    mincols = 1
    leftfill = False
    rightfill = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
            if type(outfields) is list:
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Merge overlapping intervals in one file (spaced-style variant).

    ``--threecol`` emits chrom/start/end only; otherwise all fields are
    written. Ends with a skipped-line summary for the input.
    """
    mincols = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type(line) is GenomicInterval:
                    out_file.write("%s\t%s\t%s\n" % (line.chrom, str(line.startCol), str(line.endCol)))
                elif type(line) is list:
                    # Raw field list: use the configured column indices.
                    out_file.write("%s\t%s\t%s\n" % (line[chr_col_1], str(line[start_col_1]), str(line[end_col_1])))
                else:
                    out_file.write("%s\n" % line)
            else:
                if type(line) is GenomicInterval:
                    out_file.write("%s\n" % "\t".join(line.fields))
                elif type(line) is list:
                    out_file.write("%s\n" % "\t".join(line))
                else:
                    out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
def main():
    """Count total bases covered by intervals in one file.

    This chunk ends right after computing ``bases`` via ``base_coverage``;
    writing the result is not part of this chunk.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    """Count total bases covered by intervals (compact-style variant).

    Chunk ends after computing ``bases``; the output step is not included.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    """Count total bases covered by intervals and write the number to a file.

    Writes the base count as a single line, then reports any skipped
    (unparseable) input lines.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    out_file = open(out_fname, "w")
    out_file.write("%s\n" % str(bases))
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
def main():
    """Cluster intervals and emit results in one of five output modes.

    Modes: 1 = merged cluster spans, 2 = filtered original lines (file order
    preserved), 3 = original lines grouped by cluster, 4 = smallest interval
    per cluster, 5 = largest interval per cluster.
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # A required overlap is expressed as a negative clustering distance.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    f1 = open(in_fname, "r")
    out_file = open(out_fname, "w")
    # If "merge": write one synthetic interval spanning each cluster.
    if output == 1:
        fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write("%s\n" % "\t".join(fields))
    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write("%s\n" % line.rstrip("\n\r"))
    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write("%s\n" % fileLines[linenum].rstrip("\n\r"))
    # If "minimum"/"maximum" we output the smallest/largest interval in each cluster.
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval(g1, fileline.split("\t"),
                                                           g1.chrom_col,
                                                           g1.start_col,
                                                           g1.end_col,
                                                           g1.strand_col,
                                                           g1.default_strand,
                                                           g1.fix_strand)
                    except Exception as exc:
                        print(str(exc), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    # Keep the smallest (mode 4) or largest (mode 5) so far.
                    if outsize == -1 or \
                            (outsize > interval_size and output == 4) or \
                            (outsize < interval_size and output == 5):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write("%s\n" % outinterval)
    f1.close()
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
# NOTE(review): fragment — tail of a join-tool main() whose head (option parsing
# and the start of the g2 reader construction) is missing from this chunk; the
# line begins mid-call. Still uses Python 2-only syntax ("except ParseError, exc"
# and print statements) — TODO confirm against the complete source before fixing.
chrom_col=chr_col_2, start_col=start_col_2, end_col=end_col_2, strand_col=strand_col_2, fix_strand=True ) out_file = open( out_fname, "w" ) try: for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill): if type( outfields ) is list: out_file.write( "%s\n" % "\t".join( outfields ) ) else: out_file.write( "%s\n" % outfields ) except ParseError, exc: out_file.close() fail( "Invalid file format: %s" % str( exc ) ) except MemoryError: out_file.close() fail( "Input datasets were too large to complete the join operation." ) out_file.close() if g1.skipped > 0: print skipped( g1, filedesc=" of 1st dataset" ) if g2.skipped > 0: print skipped( g2, filedesc=" of 2nd dataset" ) if __name__ == "__main__": main()
def main():
    """Find features in dataset 2 flanking (proximal to) intervals in dataset 1.

    Either input may be GFF; GFF inputs are read through
    GFFIntervalToBEDReaderWrapper and converted back to GFF coordinates on
    output. ``direction`` is passed through to the proximal-region finder.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname, direction = args
    except Exception:
        doc_optparse.exception()
    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper
    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)
    # Find flanking features.
    out_file = open(out_fname, "w")
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type(result) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'
                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff(line)
                    closest_feature = convert_bed_coords_to_gff(closest_feature)
                    # Escape double quotes in the closest feature's attributes.
                    out_file.write("%s closest_feature \"%s\" \n" %
                                   ("\t".join(line.fields),
                                    "\t".join(closest_feature.fields).replace("\"", "\\\"")))
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend(line.fields)
                    output_line_fields.extend(closest_feature.fields)
                    out_file.write("%s\n" % ("\t".join(output_line_fields)))
            else:
                out_file.write("%s\n" % result)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))
def main():
    """Find flanking features (compact-style variant of the proximal-region tool).

    Wraps GFF inputs for BED-coordinate processing, streams results from
    ``proximal_region_finder``, and converts GFF output coordinates back.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(options.cols2)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname, direction = args
    except Exception:
        doc_optparse.exception()
    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper
    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)
    # Find flanking features.
    out_file = open(out_fname, "w")
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type(result) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'
                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff(line)
                    closest_feature = convert_bed_coords_to_gff(closest_feature)
                    # Escape double quotes in the closest feature's attributes.
                    out_file.write("%s closest_feature \"%s\" \n" %
                                   ("\t".join(line.fields),
                                    "\t".join(closest_feature.fields).replace("\"", "\\\"")))
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend(line.fields)
                    output_line_fields.extend(closest_feature.fields)
                    out_file.write("%s\n" % ("\t".join(output_line_fields)))
            else:
                out_file.write("%s\n" % result)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))
def main():
    """Cluster intervals and emit results (compact-style variant).

    Output modes: 1 = merged cluster spans, 2 = filtered original lines in
    file order, 3 = original lines grouped by cluster, 4 = smallest interval
    per cluster, 5 = largest interval per cluster.
    """
    distance = 0
    minregions = 2
    output = 1
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # A required overlap is a negative clustering distance.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
    f1 = open(in_fname, "r")
    out_file = open(out_fname, "w")
    # If "merge": one synthetic interval per cluster span.
    if output == 1:
        fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write("%s\n" % "\t".join(fields))
    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write("%s\n" % line.rstrip("\n\r"))
    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write("%s\n" % fileLines[linenum].rstrip("\n\r"))
    # If "minimum"/"maximum" we output the smallest/largest interval per cluster.
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval(g1, fileline.split("\t"),
                                                           g1.chrom_col,
                                                           g1.start_col,
                                                           g1.end_col,
                                                           g1.strand_col,
                                                           g1.default_strand,
                                                           g1.fix_strand)
                    except Exception as exc:
                        print(str(exc), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                            (outsize > interval_size and output == 4) or \
                            (outsize < interval_size and output == 5):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write("%s\n" % outinterval)
    f1.close()
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
def main():
    """Complement intervals against chromosome lengths.

    With ``--all``, builds full-chromosome intervals from the lengths file
    and subtracts the input from them; otherwise runs the generic
    ``complement`` using a name->length dict. This chunk ends after the
    ParseError handler.
    """
    allchroms = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome. The lengths
    # are added to the lens dict and passed to the complement operation code in bx.
    dbfile = fileinput.FileInput(lengths)
    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except Exception:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except Exception:
                pass
    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False
    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)
    out_file = open(out_fname, "w")
    try:
        for interval in generator:
            if type(interval) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    """Complement intervals against chromosome lengths (complete variant).

    With ``--all``, subtracts the input from full-chromosome intervals built
    from the lengths file; otherwise complements using a name->length dict.
    Ends with a skipped-line summary.
    """
    allchroms = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome. The lengths
    # are added to the lens dict and passed to the complement operation code in bx.
    dbfile = fileinput.FileInput(lengths)
    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except Exception:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except Exception:
                pass
    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False
    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)
    out_file = open(out_fname, "w")
    try:
        for interval in generator:
            if type(interval) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    out_file.close()
    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
# NOTE(review): fragment — tail of a join-tool main() whose head (option parsing
# and the g1/g2 reader construction) is missing from this chunk; g1, g2, mincols,
# rightfill, leftfill and out_fname are defined in the missing part. Still uses
# Python 2-only syntax ("except ParseError, exc" and print statements) — TODO
# confirm against the complete source before fixing.
out_file = open(out_fname, "w") try: for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill): if type(outfields) is list: out_file.write("%s\n" % "\t".join(outfields)) else: out_file.write("%s\n" % outfields) except ParseError, exc: out_file.close() fail("Invalid file format: %s" % str(exc)) except MemoryError: out_file.close() fail("Input datasets were too large to complete the join operation.") out_file.close() if g1.skipped > 0: print skipped(g1, filedesc=" of 1st dataset") if g2.skipped > 0: print skipped(g2, filedesc=" of 2nd dataset") if __name__ == "__main__": main()