def __main__():
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse( __doc__ )
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )

            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
            tmp_stderr = open( tmp_name, 'wb' )
            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open( tmp_name, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err( 'Error running faToTwoBit. ' + str( e ) )
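For reference, the faToTwoBit call and stderr capture above can be written more compactly with subprocess.run (Python 3.5+); a minimal sketch with hypothetical file names, assuming faToTwoBit is on PATH:

import subprocess

proc = subprocess.run(["faToTwoBit", "input.fasta", "sequence.2bit"],
                      stderr=subprocess.PIPE)
if proc.returncode != 0:
    raise Exception(proc.stderr.decode())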
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in coverage( [g1, g2] ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
Example #3
def main():
    mincols = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.mincols:
            mincols = int( options.mincols )
        pieces = bool( options.pieces )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    if in1_gff_format:
        # Intersect requires coordinates in BED format.
        g1.convert_to_bed_coord = True
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )
    if in2_gff_format:
        # Intersect requires coordinates in BED format.
        g2.convert_to_bed_coord = True

    out_file = open( out_fname, "w" )
    try:
        for feature in intersect( [g1, g2], pieces=pieces, mincols=mincols ):
            if isinstance( feature, GFFFeature ):
                # Convert back to GFF coordinates since reader converted automatically.
                convert_bed_coords_to_gff( feature )
                for interval in feature.intervals:
                    out_file.write( "%s\n" % "\t".join( interval.fields ) )
            elif isinstance( feature, GenomicInterval ):
                out_file.write( "%s\n" % "\t".join( feature.fields ) )
            else:
                out_file.write( "%s\n" % feature )
    except ParseError as e:
        out_file.close()
        fail( "Invalid file format: %s" % str( e ) )
Example #4
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in coverage([g1, g2]):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
Example #5
def main():
    mincols = 1
    leftfill = False
    rightfill = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for outfields in join(g1,
                              g2,
                              mincols=mincols,
                              rightfill=rightfill,
                              leftfill=leftfill):
            if type(outfields) is list:
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
Example #6
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
Example #8
def main():
    sameformat = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in concat( [g1, g2], sameformat=sameformat ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
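concat simply chains the two readers; with sameformat=True both inputs are treated as sharing one column layout. A tiny sketch with hypothetical data:

from bx.intervals.io import GenomicInterval, GenomicIntervalReader
from bx.intervals.operations.concat import concat

a = GenomicIntervalReader(["chr1\t100\t200\tf1\t0\t+"])
b = GenomicIntervalReader(["chr2\t10\t20\tf2\t0\t+"])
for line in concat([a, b], sameformat=True):
    if type(line) is GenomicInterval:
        print("\t".join(line.fields))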
Example #9
def main():
    mincols = 1
    leftfill = False
    rightfill = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.mincols:
            mincols = int( options.mincols )
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
            if type( outfields ) is list:
                out_file.write( "%s\n" % "\t".join( outfields ) )
            else:
                out_file.write( "%s\n" % outfields )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
Example #10
def main():
    mincols = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type(line) is GenomicInterval:
                    out_file.write(
                        "%s\t%s\t%s\n" %
                        (line.chrom, str(line.startCol), str(line.endCol)))
                elif type(line) is list:
                    out_file.write("%s\t%s\t%s\n" %
                                   (line[chr_col_1], str(line[start_col_1]),
                                    str(line[end_col_1])))
                else:
                    out_file.write("%s\n" % line)
            else:
                if type(line) is GenomicInterval:
                    out_file.write("%s\n" % "\t".join(line.fields))
                elif type(line) is list:
                    out_file.write("%s\n" % "\t".join(line))
                else:
                    out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
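merge collapses overlapping intervals from a single reader; a minimal sketch with hypothetical data:

from bx.intervals.io import GenomicIntervalReader
from bx.intervals.operations.merge import merge

g = GenomicIntervalReader(["chr1\t100\t200\tf1\t0\t+",
                           "chr1\t150\t300\tf2\t0\t+"])
for line in merge(g, mincols=1):
    # The two overlapping inputs should come back as one chr1 100-300 interval.
    if type(line) is list:
        print("\t".join(line))
    else:
        print(line)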
def main():
    mincols = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.mincols:
            mincols = int( options.mincols )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\t%s\t%s\n" % ( line.chrom, str( line.startCol ), str( line.endCol ) ) )
                elif type( line ) is list:
                    out_file.write( "%s\t%s\t%s\n" % ( line[chr_col_1], str( line[start_col_1] ), str( line[end_col_1] ) ) )
                else:
                    out_file.write( "%s\n" % line )
            else:
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\n" % "\t".join( line.fields ) )
                elif type( line ) is list:
                    out_file.write( "%s\n" % "\t".join( line ) )
                else:
                    out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
Example #13
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
    out_file = open( out_fname, "w" )
    out_file.write( "%s\n" % str( bases ) )
    out_file.close()
    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
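base_coverage consumes the reader and returns a single integer: the total number of bases covered by at least one interval. A sketch with hypothetical data:

from bx.intervals.io import GenomicIntervalReader
from bx.intervals.operations.base_coverage import base_coverage

g = GenomicIntervalReader(["chr1\t100\t200\tf1\t0\t+",
                           "chr1\t150\t300\tf2\t0\t+"])
print(base_coverage(g))  # 200: the union of [100,200) and [150,300)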
Example #15
def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()


if len(sys.argv) < 4:
    stop_err("Incorrect number of arguments.")

inp_file = sys.argv[1]
out_file = sys.argv[2]
fout = open(out_file, 'w')
int_file = sys.argv[3]
if int_file != "None":     #The user has specified an interval file
    dbkey_i = sys.argv[4]
    chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )


def rateEstimator(block):
    global alignlen, mismatches

    src1 = block.components[0].src
    sequence1 = block.components[0].text
    start1 = block.components[0].start
    end1 = block.components[0].end
    len1 = int(end1)-int(start1)
    len1_withgap = len(sequence1)
    mismatch = 0.0
    
    for seq in range(1, len(block.components)):
        src2 = block.components[seq].src
def __main__():
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse(__doc__)
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(
                options.cols)
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg(
                options.cols)
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile(dir=".").name
            cmd = "faToTwoBit %s %s" % (fasta_file, seq_path)

            tmp_name = tempfile.NamedTemporaryFile(dir=".").name
            tmp_stderr = open(tmp_name, 'wb')
            proc = subprocess.Popen(args=cmd,
                                    shell=True,
                                    stderr=tmp_stderr.fileno())
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open(tmp_name, 'rb')
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read(buffsize)
                    if not stderr or len(stderr) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err('Error running faToTwoBit. ' + str(e))
    else:
        seq_path = check_seq_file(dbkey, GALAXY_DATA_INDEX_DIR)
        if not os.path.exists(seq_path):
            # If this occurs, we need to fix the metadata validator.
            stop_err(
                "No sequences are available for '%s', request them by reporting this error."
                % dbkey)

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines(feature):
        if isinstance(feature, gff_util.GFFFeature):
            return feature.lines()
        else:
            return [feature.rstrip('\r\n')]

    skipped_lines = 0
    first_invalid_line = 0
    invalid_lines = []
    fout = open(output_filename, "w")
    warnings = []
    warning = ''
    twobitfile = None
    file_iterator = open(input_filename)
    if gff_format and interpret_features:
        file_iterator = gff_util.GFFReaderWrapper(file_iterator,
                                                  fix_strand=False)
    line_count = 1
    for feature in file_iterator:
        # Ignore comments, headers.
        if isinstance(feature, (Header, Comment)):
            line_count += 1
            continue

        name = ""
        if gff_format and interpret_features:
            # Processing features.
            gff_util.convert_gff_coords_to_bed(feature)
            chrom = feature.chrom
            start = feature.start
            end = feature.end
            strand = feature.strand
        else:
            # Processing lines, either interval or GFF format.
            line = feature.rstrip('\r\n')
            if line and not line.startswith("#"):
                fields = line.split('\t')
                try:
                    chrom = fields[chrom_col]
                    start = int(fields[start_col])
                    end = int(fields[end_col])
                    if name_col:
                        name = fields[name_col]
                    if gff_format:
                        start, end = gff_util.convert_gff_coords_to_bed(
                            [start, end])
                    if includes_strand_col:
                        strand = fields[strand_col]
                except:
                    warning = "Invalid chrom, start or end column values. "
                    warnings.append(warning)
                    if not invalid_lines:
                        invalid_lines = get_lines(feature)
                        first_invalid_line = line_count
                    skipped_lines += len(invalid_lines)
                    continue
                if start > end:
                    warning = "Invalid interval, start '%d' > end '%d'.  " % (
                        start, end)
                    warnings.append(warning)
                    if not invalid_lines:
                        invalid_lines = get_lines(feature)
                        first_invalid_line = line_count
                    skipped_lines += len(invalid_lines)
                    continue

                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''
            else:
                continue

        # Open sequence file and get sequence for feature/interval.
        if seq_path and os.path.exists("%s/%s.nib" % (seq_path, chrom)):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                nibs[chrom] = nib = bx.seq.nib.NibFile(
                    open("%s/%s.nib" % (seq_path, chrom)))
            try:
                sequence = nib.get(start, end - start)
            except Exception as e:
                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                    start, end - start, dbkey)
                warnings.append(warning)
                if not invalid_lines:
                    invalid_lines = get_lines(feature)
                    first_invalid_line = line_count
                skipped_lines += len(invalid_lines)
                continue
        elif seq_path and os.path.isfile(seq_path):
            if not twobitfile:
                twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path))
            try:
                if options.gff and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
                    for interval in feature.intervals:
                        sequence += twobitfile[
                            interval.chrom][interval.start:interval.end]
                else:
                    sequence = twobitfile[chrom][start:end]
            except:
                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (
                    start, end - start, chrom)
                warnings.append(warning)
                if not invalid_lines:
                    invalid_lines = get_lines(feature)
                    first_invalid_line = line_count
                skipped_lines += len(invalid_lines)
                continue
        else:
            warning = "Chromosome by name '%s' was not found for build '%s'. " % (
                chrom, dbkey)
            warnings.append(warning)
            if not invalid_lines:
                invalid_lines = get_lines(feature)
                first_invalid_line = line_count
            skipped_lines += len(invalid_lines)
            continue
        if sequence == '':
            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % \
                ( chrom, start, end, dbkey )
            warnings.append(warning)
            if not invalid_lines:
                invalid_lines = get_lines(feature)
                first_invalid_line = line_count
            skipped_lines += len(invalid_lines)
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement(sequence)

        if output_format == "fasta":
            l = len(sequence)
            c = 0
            if gff_format:
                start, end = gff_util.convert_bed_coords_to_gff([start, end])
            fields = [dbkey, str(chrom), str(start), str(end), strand]
            meta_data = "_".join(fields)
            if name.strip():
                fout.write(">%s %s\n" % (meta_data, name))
            else:
                fout.write(">%s\n" % meta_data)
            while c < l:
                b = min(c + 50, l)
                fout.write("%s\n" % str(sequence[c:b]))
                c = b
        else:  # output_format == "interval"
            if gff_format and interpret_features:
                # TODO: need better GFF Reader to capture all information needed
                # to produce this line.
                meta_data = "\t".join([
                    feature.chrom, "galaxy_extract_genomic_dna", "interval",
                    str(feature.start),
                    str(feature.end), feature.score, feature.strand, ".",
                    gff_util.gff_attributes_to_str(feature.attributes, "GTF")
                ])
            else:
                meta_data = "\t".join(fields)
            if gff_format:
                format_str = "%s seq \"%s\";\n"
            else:
                format_str = "%s\t%s\n"
            fout.write(format_str % (meta_data, str(sequence)))

        # Update line count.
        if isinstance(feature, gff_util.GFFFeature):
            line_count += len(feature.intervals)
        else:
            line_count += 1

    fout.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped_lines:
        # Error message includes up to the first 10 skipped lines.
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % (
            skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10])))

    # Clean up temp file.
    if fasta_file:
        os.remove(seq_path)
        os.remove(tmp_name)
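The sequence lookups above use bx.seq: .nib files hold one chromosome each, while a .2bit file holds a whole genome and supports slicing. A minimal sketch with a hypothetical genome path (coordinates are 0-based, end-exclusive):

import bx.seq.twobit

twobit = bx.seq.twobit.TwoBitFile(open("genome.2bit", "rb"))
sequence = twobit["chr1"][100:150]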
Example #17
def __main__():
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse( __doc__ )
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )

            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
            tmp_stderr = open( tmp_name, 'wb' )
            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open( tmp_name, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err( 'Error running faToTwoBit. ' + str( e ) )
    else:
        seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
        if not os.path.exists( seq_path ):
            # If this occurs, we need to fix the metadata validator.
            stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines( feature ):
        if isinstance( feature, gff_util.GFFFeature ):
            return feature.lines()
        else:
            return [ feature.rstrip( '\r\n' ) ]

    skipped_lines = 0
    first_invalid_line = 0
    invalid_lines = []
    fout = open( output_filename, "w" )
    warnings = []
    warning = ''
    twobitfile = None
    file_iterator = open( input_filename )
    if gff_format and interpret_features:
        file_iterator = gff_util.GFFReaderWrapper( file_iterator, fix_strand=False )
    line_count = 1
    for feature in file_iterator:
        # Ignore comments, headers.
        if isinstance( feature, ( Header, Comment ) ):
            line_count += 1
            continue

        name = ""
        if gff_format and interpret_features:
            # Processing features.
            gff_util.convert_gff_coords_to_bed( feature )
            chrom = feature.chrom
            start = feature.start
            end = feature.end
            strand = feature.strand
        else:
            # Processing lines, either interval or GFF format.
            line = feature.rstrip( '\r\n' )
            if line and not line.startswith( "#" ):
                fields = line.split( '\t' )
                try:
                    chrom = fields[chrom_col]
                    start = int( fields[start_col] )
                    end = int( fields[end_col] )
                    if name_col:
                        name = fields[name_col]
                    if gff_format:
                        start, end = gff_util.convert_gff_coords_to_bed( [start, end] )
                    if includes_strand_col:
                        strand = fields[strand_col]
                except:
                    warning = "Invalid chrom, start or end column values. "
                    warnings.append( warning )
                    if not invalid_lines:
                        invalid_lines = get_lines( feature )
                        first_invalid_line = line_count
                    skipped_lines += len( invalid_lines )
                    continue
                if start > end:
                    warning = "Invalid interval, start '%d' > end '%d'.  " % ( start, end )
                    warnings.append( warning )
                    if not invalid_lines:
                        invalid_lines = get_lines( feature )
                        first_invalid_line = line_count
                    skipped_lines += len( invalid_lines )
                    continue

                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''
            else:
                continue

        # Open sequence file and get sequence for feature/interval.
        if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                nibs[chrom] = nib = bx.seq.nib.NibFile( open( "%s/%s.nib" % ( seq_path, chrom ) ) )
            try:
                sequence = nib.get( start, end - start )
            except Exception as e:
                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % ( start, end - start, dbkey )
                warnings.append( warning )
                if not invalid_lines:
                    invalid_lines = get_lines( feature )
                    first_invalid_line = line_count
                skipped_lines += len( invalid_lines )
                continue
        elif seq_path and os.path.isfile( seq_path ):
            if not twobitfile:
                twobitfile = bx.seq.twobit.TwoBitFile( open( seq_path ) )
            try:
                if options.gff and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
                    for interval in feature.intervals:
                        sequence += twobitfile[interval.chrom][interval.start:interval.end]
                else:
                    sequence = twobitfile[chrom][start:end]
            except:
                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % ( start, end - start, chrom )
                warnings.append( warning )
                if not invalid_lines:
                    invalid_lines = get_lines( feature )
                    first_invalid_line = line_count
                skipped_lines += len( invalid_lines )
                continue
        else:
            warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
            warnings.append( warning )
            if not invalid_lines:
                invalid_lines = get_lines( feature )
                first_invalid_line = line_count
            skipped_lines += len( invalid_lines )
            continue
        if sequence == '':
            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % \
                ( chrom, start, end, dbkey )
            warnings.append( warning )
            if not invalid_lines:
                invalid_lines = get_lines( feature )
                first_invalid_line = line_count
            skipped_lines += len( invalid_lines )
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement( sequence )

        if output_format == "fasta":
            l = len( sequence )
            c = 0
            if gff_format:
                start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
            fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
            meta_data = "_".join( fields )
            if name.strip():
                fout.write( ">%s %s\n" % (meta_data, name) )
            else:
                fout.write( ">%s\n" % meta_data )
            while c < l:
                b = min( c + 50, l )
                fout.write( "%s\n" % str( sequence[c:b] ) )
                c = b
        else:  # output_format == "interval"
            if gff_format and interpret_features:
                # TODO: need better GFF Reader to capture all information needed
                # to produce this line.
                meta_data = "\t".join(
                    [feature.chrom, "galaxy_extract_genomic_dna", "interval",
                    str( feature.start ), str( feature.end ), feature.score, feature.strand,
                    ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
            else:
                meta_data = "\t".join( fields )
            if gff_format:
                format_str = "%s seq \"%s\";\n"
            else:
                format_str = "%s\t%s\n"
            fout.write( format_str % ( meta_data, str( sequence ) ) )

        # Update line count.
        if isinstance( feature, gff_util.GFFFeature ):
            line_count += len( feature.intervals )
        else:
            line_count += 1

    fout.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len( warnings )
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped_lines:
        # Error message includes up to the first 10 skipped lines.
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) ))

    # Clean up temp file.
    if fasta_file:
        os.remove( seq_path )
        os.remove( tmp_name )
Example #18
def main():
    try:
        if int( sys.argv[3] ) < 0:
            raise Exception
    except:
        stop_err( "Length of flanking region(s) must be a non-negative integer." )

    # Parsing Command Line here
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols )
        inp_file, out_file, size, direction, region = args
        if strand_col_1 <= 0:
            strand = "+"  # if strand is not defined, default it to +
    except:
        stop_err( "Metadata issue, correct the metadata attributes by clicking on the pencil icon in the history item." )
    try:
        offset = int(options.off)
        size = int(size)
    except:
        stop_err( "Invalid offset or length entered. Try again by entering valid integer values." )

    fo = open(out_file, 'w')

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = None
    elems = []
    j = 0
    for i, line in enumerate( open( inp_file ) ):
        line = line.strip()
        if line and (not line.startswith( '#' )) and line != '':
            j += 1
            try:
                elems = line.split('\t')
                # If the start and/or end columns are not numbers, skip the line.
                assert int(elems[start_col_1])
                assert int(elems[end_col_1])
                if strand_col_1 != -1:
                    strand = elems[strand_col_1]
                # If the strand value is not + or -, skip the line.
                assert strand in ['+', '-']
                if direction == 'Upstream':
                    if strand == '+':
                        if region == 'end':
                            elems[end_col_1] = str(int(elems[end_col_1]) + offset)
                            elems[start_col_1] = str( int(elems[end_col_1]) - size )
                        else:
                            elems[end_col_1] = str(int(elems[start_col_1]) + offset)
                            elems[start_col_1] = str( int(elems[end_col_1]) - size )
                    elif strand == '-':
                        if region == 'end':
                            elems[start_col_1] = str(int(elems[start_col_1]) - offset)
                            elems[end_col_1] = str(int(elems[start_col_1]) + size)
                        else:
                            elems[start_col_1] = str(int(elems[end_col_1]) - offset)
                            elems[end_col_1] = str(int(elems[start_col_1]) + size)
                    assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                    fo.write( "%s\n" % '\t'.join( elems ) )

                elif direction == 'Downstream':
                    if strand == '-':
                        if region == 'start':
                            elems[end_col_1] = str(int(elems[end_col_1]) - offset)
                            elems[start_col_1] = str( int(elems[end_col_1]) - size )
                        else:
                            elems[end_col_1] = str(int(elems[start_col_1]) - offset)
                            elems[start_col_1] = str( int(elems[end_col_1]) - size )
                    elif strand == '+':
                        if region == 'start':
                            elems[start_col_1] = str(int(elems[start_col_1]) + offset)
                            elems[end_col_1] = str(int(elems[start_col_1]) + size)
                        else:
                            elems[start_col_1] = str(int(elems[end_col_1]) + offset)
                            elems[end_col_1] = str(int(elems[start_col_1]) + size)
                    assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                    fo.write( "%s\n" % '\t'.join( elems ) )

                elif direction == 'Both':
                    if strand == '-':
                        if region == 'start':
                            start = str(int(elems[end_col_1]) - offset)
                            end1 = str(int(start) + size)
                            end2 = str(int(start) - size)
                            elems[start_col_1] = start
                            elems[end_col_1] = end1
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                            elems[start_col_1] = end2
                            elems[end_col_1] = start
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                        elif region == 'end':
                            start = str(int(elems[start_col_1]) - offset)
                            end1 = str(int(start) + size)
                            end2 = str(int(start) - size)
                            elems[start_col_1] = start
                            elems[end_col_1] = end1
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                            elems[start_col_1] = end2
                            elems[end_col_1] = start
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                        else:
                            start1 = str(int(elems[end_col_1]) - offset)
                            end1 = str(int(start1) + size)
                            start2 = str(int(elems[start_col_1]) - offset)
                            end2 = str(int(start2) - size)
                            elems[start_col_1] = start1
                            elems[end_col_1] = end1
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                            elems[start_col_1] = end2
                            elems[end_col_1] = start2
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                    elif strand == '+':
                        if region == 'start':
                            start = str(int(elems[start_col_1]) + offset)
                            end1 = str(int(start) - size)
                            end2 = str(int(start) + size)
                            elems[start_col_1] = end1
                            elems[end_col_1] = start
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                            elems[start_col_1] = start
                            elems[end_col_1] = end2
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                        elif region == 'end':
                            start = str(int(elems[end_col_1]) + offset)
                            end1 = str(int(start) - size)
                            end2 = str(int(start) + size)
                            elems[start_col_1] = end1
                            elems[end_col_1] = start
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                            elems[start_col_1] = start
                            elems[end_col_1] = end2
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                        else:
                            start1 = str(int(elems[start_col_1]) + offset)
                            end1 = str(int(start1) - size)
                            start2 = str(int(elems[end_col_1]) + offset)
                            end2 = str(int(start2) + size)
                            elems[start_col_1] = end1
                            elems[end_col_1] = start1
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
                            elems[start_col_1] = start2
                            elems[end_col_1] = end2
                            assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0
                            fo.write( "%s\n" % '\t'.join( elems ) )
            except:
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
    fo.close()

    if skipped_lines == j:
        stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
    if skipped_lines > 0:
        print('Skipped %d invalid lines starting with line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ))
    print('Location: %s, Region: %s, Flank-length: %d, Offset: %d ' % ( direction, region, size, offset ))
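The coordinate arithmetic above is easiest to see one branch at a time; a sketch with hypothetical numbers mirroring the Upstream, '+'-strand branch anchored at the interval start:

start, size, offset = 1000, 500, 0  # interval start, flank length, offset
flank_end = start + offset          # anchor the flank at the interval start
flank_start = flank_end - size      # extend `size` bases upstream
assert flank_start > 0 and flank_end > 0
print(flank_start, flank_end)       # 500 1000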
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    f1 = open( in_fname, "r" )
    out_file = open( out_fname, "w" )

    # If "merge"
    if output == 1:
        fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write( "%s\n" % "\t".join( fields ) )

    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write( "%s\n" % line.rstrip( "\n\r" ) )

    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )

    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval( g1, fileline.split("\t"),
                                                            g1.chrom_col,
                                                            g1.start_col,
                                                            g1.end_col,
                                                            g1.strand_col,
                                                            g1.default_strand,
                                                            g1.fix_strand )
                    except Exception as exc:
                        print(str( exc ), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write( "%s\n" % outinterval )

    f1.close()
    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
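find_clusters returns a dict mapping each chromosome to a cluster tree, plus the line numbers that fell outside any cluster. A sketch with hypothetical data, allowing clustered intervals to be up to 50 bp apart:

from bx.intervals.io import GenomicIntervalReader
from bx.intervals.operations.find_clusters import find_clusters

g = GenomicIntervalReader(["chr1\t100\t200\tf1\t0\t+",
                           "chr1\t220\t300\tf2\t0\t+"])
clusters, extra = find_clusters(g, mincols=50, minregions=2)
for chrom, tree in clusters.items():
    for start, end, lines in tree.getregions():
        print(chrom, start, end, len(lines))  # expect one cluster, chr1 100-300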
Example #20
def main():
    allchroms = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed to the complement operation code in bx.
    dbfile = fileinput.FileInput(lengths)

    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except:
                pass

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open(out_fname, "w")

    try:
        for interval in generator:
            if type(interval) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    out_file.close()
Example #21
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)

    # Find flanking features.
    out_file = open(out_fname, "w")
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type(result) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with an added 'closest_feature' attribute.

                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff(line)
                    closest_feature = convert_bed_coords_to_gff(
                        closest_feature)

                    # Escape double quotes in the closest feature's attributes.
                    out_file.write(
                        "%s closest_feature \"%s\" \n" %
                        ("\t".join(line.fields), "\t".join(
                            closest_feature.fields).replace("\"", "\\\"")))
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend(line.fields)
                    output_line_fields.extend(closest_feature.fields)
                    out_file.write("%s\n" % ("\t".join(output_line_fields)))
            else:
                out_file.write("%s\n" % result)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))
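The convert_bed_coords_to_gff call reflects the usual convention difference: BED intervals are 0-based and half-open, while GFF features are 1-based and closed. A hedged sketch of that shift (the real Galaxy helper may do more than this):

# Hypothetical stand-in for convert_bed_coords_to_gff: shift a 0-based,
# half-open BED interval to 1-based, closed GFF coordinates.
def bed_to_gff_coords(start, end):
    return start + 1, end  # half-open end equals closed end

print(bed_to_gff_coords(99, 200))  # (100, 200): BED [99, 200) == GFF 100..200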
Example #22
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )

    # Find flanking features.
    out_file = open( out_fname, "w" )
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type( result ) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with an added 'closest_feature' attribute.

                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff( line )
                    closest_feature = convert_bed_coords_to_gff( closest_feature )

                    # Escape double quotes in the closest feature's attributes.
                    out_file.write( "%s closest_feature \"%s\" \n" %
                                    ( "\t".join( line.fields ),
                                      "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
                                      ) )
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend( line.fields )
                    output_line_fields.extend( closest_feature.fields )
                    out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
            else:
                out_file.write( "%s\n" % result )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
Example #23
def main():
    allchroms = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed to the complement operation code in bx.
    dbfile = fileinput.FileInput( lengths )

    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except:
                pass

    # Safety: if the dbfile didn't exist and allchroms was requested, fall
    # back to the generic complement.
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open( out_fname, "w" )

    try:
        for interval in generator:
            if type( interval ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( interval ) )
            else:
                out_file.write( "%s\n" % interval )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
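The allchroms branch computes the complement a second way: it builds one interval spanning each whole chromosome and subtracts the input from those spans, which also yields chromosomes the input never touches. A minimal one-chromosome sketch of the idea on plain intervals (hypothetical helper, not the bx subtract):

def complement_one_chrom(chrom_len, intervals):
    # Subtract sorted input intervals from the span [0, chrom_len).
    gaps, pos = [], 0
    for start, end in sorted(intervals):
        if start > pos:
            gaps.append((pos, start))
        pos = max(pos, end)
    if pos < chrom_len:
        gaps.append((pos, chrom_len))
    return gaps

print(complement_one_chrom(1000, [(100, 200), (400, 500)]))
# [(0, 100), (200, 400), (500, 1000)]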
Example #24
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    f1 = open(in_fname, "r")
    out_file = open(out_fname, "w")

    # If "merge"
    if output == 1:
        fields = [
            "."
            for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)
        ]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write("%s\n" % "\t".join(fields))

    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write("%s\n" % line.rstrip("\n\r"))

    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write("%s\n" % fileLines[linenum].rstrip("\n\r"))

    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval(
                            g1, fileline.split("\t"), g1.chrom_col,
                            g1.start_col, g1.end_col, g1.strand_col,
                            g1.default_strand, g1.fix_strand)
                    except Exception as exc:
                        print(str(exc), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write("%s\n" % outinterval)

    f1.close()
    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
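For reference, the five output modes this cluster tool handles above (a summary of the branches, not part of the original script):

OUTPUT_MODES = {
    1: "merge: one merged interval per cluster region",
    2: "filtered: original lines that fall in any cluster, in file order",
    3: "clustered: original lines written out cluster by cluster",
    4: "minimum: the smallest interval in each cluster",
    5: "maximum: the largest interval in each cluster",
}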