Ejemplo n.º 1
0
def __main__():
    """Print the nib sequence for every range listed in a range file.

    Positional command-line arguments, as returned by
    ``doc_optparse.parse(__doc__)``:

    * ``args[0]`` -- path of the range file.  The first two
      whitespace-separated fields of each line are read as the integers
      ``start`` and ``end``.
    * ``args[1]`` -- path of the nib file the sequence is read from.

    For each range a ``> start end`` header line is printed, followed by the
    result of ``nib.get(start, end - start)`` passed to ``print_wrapped``.
    Blank lines in the range file are ignored.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        range_file = open(args[0])
        # nib is a binary format; binary mode keeps the bytes intact on
        # every platform.
        nib_file = open(args[1], "rb")
    except (IndexError, IOError):
        # Missing argument or unreadable file: hand over to doc_optparse.
        doc_optparse.exit()

    try:
        nib = bx.seq.nib.NibFile(nib_file)
        for line in range_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to extract.
                continue
            start, end = int(fields[0]), int(fields[1])
            print("> %s %s" % (start, end))
            # NibFile.get() is called with (start, length).
            print_wrapped(nib.get(start, end - start))
    finally:
        # Release both handles even when a malformed line aborts the loop.
        range_file.close()
        nib_file.close()
def __main__():
    """Print the nib sequence for every range listed in a range file.

    Positional command-line arguments, as returned by
    ``doc_optparse.parse(__doc__)``:

    * ``args[0]`` -- path of the range file.  The first two
      whitespace-separated fields of each line are read as the integers
      ``start`` and ``end``.
    * ``args[1]`` -- path of the nib file the sequence is read from.

    For each range a ``> start end`` header line is printed, followed by the
    result of ``nib.get(start, end - start)`` passed to ``print_wrapped``.
    Blank lines in the range file are ignored.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        range_file = open(args[0])
        # nib is a binary format; binary mode keeps the bytes intact on
        # every platform.
        nib_file = open(args[1], "rb")
    except (IndexError, IOError):
        # Missing argument or unreadable file: hand over to doc_optparse.
        doc_optparse.exit()

    try:
        nib = bx.seq.nib.NibFile(nib_file)
        for line in range_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to extract.
                continue
            start, end = int(fields[0]), int(fields[1])
            print("> %s %s" % (start, end))
            # NibFile.get() is called with (start, length).
            print_wrapped(nib.get(start, end - start))
    finally:
        # Release both handles even when a malformed line aborts the loop.
        range_file.close()
        nib_file.close()
def __main__():
    """Print the nib sequence for every interval read from standard input.

    ``args[0]`` (from ``doc_optparse.parse(__doc__)``) is a directory that
    holds one ``<chrom>.nib`` file per chromosome.  Each non-blank stdin line
    supplies ``chrom``, ``start`` and ``end`` as its first three
    whitespace-separated fields.  For each interval a ``> chrom start end``
    header line is printed, followed by the result of
    ``nib.get(start, end - start)`` passed to ``print_wrapped``.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        nib_dir = args[0]
    except IndexError:
        # No directory given: hand over to doc_optparse.
        doc_optparse.exit()

    nibs = {}     # chrom -> NibFile, so each nib file is opened only once
    handles = []  # underlying file objects, closed when the loop ends

    try:
        for line in sys.stdin:
            fields = line.split()
            if not fields:
                # Blank line: nothing to extract.
                continue
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            print("> %s %s %s" % (chrom, start, end))
            try:
                nib = nibs[chrom]
            except KeyError:
                # nib is a binary format, hence binary mode.
                handle = open("%s/%s.nib" % (nib_dir, chrom), "rb")
                handles.append(handle)
                nibs[chrom] = nib = bx.seq.nib.NibFile(handle)
            # NibFile.get() is called with (start, length).
            print_wrapped(nib.get(start, end - start))
    finally:
        for handle in handles:
            handle.close()
Ejemplo n.º 4
0
def __main__():
    """Print the nib sequence for every interval read from standard input.

    ``args[0]`` (from ``doc_optparse.parse(__doc__)``) is a directory that
    holds one ``<chrom>.nib`` file per chromosome.  Each non-blank stdin line
    supplies ``chrom``, ``start`` and ``end`` as its first three
    whitespace-separated fields.  For each interval a ``> chrom start end``
    header line is printed, followed by the result of
    ``nib.get(start, end - start)`` passed to ``print_wrapped``.
    """
    options, args = doc_optparse.parse(__doc__)

    try:
        nib_dir = args[0]
    except IndexError:
        # No directory given: hand over to doc_optparse.
        doc_optparse.exit()

    nibs = {}     # chrom -> NibFile, so each nib file is opened only once
    handles = []  # underlying file objects, closed when the loop ends

    try:
        for line in sys.stdin:
            fields = line.split()
            if not fields:
                # Blank line: nothing to extract.
                continue
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            print("> %s %s %s" % (chrom, start, end))
            try:
                nib = nibs[chrom]
            except KeyError:
                # nib is a binary format, hence binary mode.
                handle = open("%s/%s.nib" % (nib_dir, chrom), "rb")
                handles.append(handle)
                nibs[chrom] = nib = bx.seq.nib.NibFile(handle)
            # NibFile.get() is called with (start, length).
            print_wrapped(nib.get(start, end - start))
    finally:
        for handle in handles:
            handle.close()
def __main__():
    """Print the CpG dinucleotide fraction of every interval read from stdin.

    ``args[0]`` (from ``cookbook.doc_optparse.parse(__doc__)``) is a
    directory that holds one ``<chrom>.nib`` file per chromosome.  Each
    non-blank stdin line supplies ``chrom``, ``start`` and ``end`` as its
    first three whitespace-separated fields.  For each interval one line is
    printed: the number of ``CG`` dinucleotides in the upper-cased sequence
    divided by the sequence length, as a float.  An empty sequence prints
    ``0.0``.
    """
    options, args = cookbook.doc_optparse.parse(__doc__)

    try:
        nib_dir = args[0]
    except IndexError:
        # No directory given: hand over to doc_optparse.
        cookbook.doc_optparse.exit()

    nibs = {}     # chrom -> NibFile, so each nib file is opened only once
    handles = []  # underlying file objects, closed when the loop ends

    try:
        for line in sys.stdin:
            fields = line.split()
            if not fields:
                # Blank line: nothing to measure.
                continue
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            try:
                nib = nibs[chrom]
            except KeyError:
                # nib is a binary format, hence binary mode.
                handle = open("%s/%s.nib" % (nib_dir, chrom), "rb")
                handles.append(handle)
                nibs[chrom] = nib = bx.seq.nib.NibFile(handle)
            seq = nib.get(start, end - start).upper()
            if not seq:
                # Avoid dividing by zero on a zero-length interval.
                print(0.0)
                continue
            # "CG" cannot overlap itself, so count() matches a pairwise scan.
            # float() forces true division; int / int truncates to 0 on
            # Python 2.
            print(float(seq.count("CG")) / len(seq))
    finally:
        for handle in handles:
            handle.close()
Ejemplo n.º 6
0
                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''
            else:
                continue

        # Open sequence file and get sequence for feature/interval.
        if seq_path and os.path.exists("%s/%s.nib" % (seq_path, chrom)):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                nibs[chrom] = nib = bx.seq.nib.NibFile(
                    file("%s/%s.nib" % (seq_path, chrom)))
            try:
                sequence = nib.get(start, end - start)
            except Exception, e:
                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                    start, end - start, dbkey)
                warnings.append(warning)
                if not invalid_lines:
                    invalid_lines = get_lines(feature)
                    first_invalid_line = line_count
                skipped_lines += len(invalid_lines)
                continue
        elif seq_path and os.path.isfile(seq_path):
            if not (twobitfile):
                twobitfile = bx.seq.twobit.TwoBitFile(file(seq_path))
            try:
                if options.gff and interpret_features:
                    # Create sequence from intervals within a feature.
def __main__():
    """Write the genomic sequence of every interval/feature in the input file.

    Options and the two positional arguments (input file name, output file
    name) come from ``doc_optparse.parse(__doc__)``.  Sequence data is taken
    from a 2bit file built with ``faToTwoBit`` when ``options.fasta`` is set,
    otherwise from the path returned by ``check_seq_file``: either
    per-chromosome ``<chrom>.nib`` files under that path, or the path itself
    opened as a 2bit file.  Each sequence is written as a FASTA record when
    ``options.output_format`` is ``"fasta"``, otherwise appended to the
    interval/GFF line.  Inputs that cannot be processed are skipped, and a
    summary of warnings and skipped lines is printed at the end.
    """
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse(__doc__)
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(
                options.cols)
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg(
                options.cols)
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except Exception:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}  # chrom -> NibFile, so each nib file is opened only once

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile(dir=".").name
            tmp_name = tempfile.NamedTemporaryFile(dir=".").name
            tmp_stderr = open(tmp_name, 'wb')
            # Argument list without a shell, so paths containing spaces or
            # shell metacharacters reach faToTwoBit unchanged.
            proc = subprocess.Popen(args=["faToTwoBit", fasta_file, seq_path],
                                    stderr=tmp_stderr.fileno())
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open(tmp_name, 'rb')
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    chunk = tmp_stderr.read(buffsize)
                    if not chunk:
                        # End of file.  Testing the chunk rather than the
                        # accumulated length also terminates when the output
                        # size is an exact multiple of buffsize.
                        break
                    stderr += chunk
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err('Error running faToTwoBit. ' + str(e))
    else:
        seq_path = check_seq_file(dbkey, GALAXY_DATA_INDEX_DIR)
        if not os.path.exists(seq_path):
            # If this occurs, we need to fix the metadata validator.
            stop_err(
                "No sequences are available for '%s', request them by reporting this error."
                % dbkey)

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines(feature):
        if isinstance(feature, gff_util.GFFFeature):
            return feature.lines()
        else:
            return [feature.rstrip('\r\n')]

    warnings = []
    # Skip bookkeeping lives in a dict so record_skip() can update it.
    skipped = {'count': 0, 'first_line': 0, 'lines': []}

    # Record a warning and account for the skipped feature's line(s).
    def record_skip(feature, line_number, message):
        warnings.append(message)
        feature_lines = get_lines(feature)
        if not skipped['lines']:
            skipped['lines'] = feature_lines
            skipped['first_line'] = line_number
        # Count this feature's own lines, not those of the first skip.
        skipped['count'] += len(feature_lines)

    fout = open(output_filename, "w")
    twobitfile = None
    input_file = open(input_filename)
    file_iterator = input_file
    if gff_format and interpret_features:
        file_iterator = gff_util.GFFReaderWrapper(file_iterator,
                                                  fix_strand=False)
    next_line = 1
    for feature in file_iterator:
        # 1-based number of this item's first line.  The counter is advanced
        # up front so that every `continue` below keeps it accurate.
        line_count = next_line
        if isinstance(feature, gff_util.GFFFeature):
            next_line += len(feature.intervals)
        else:
            next_line += 1

        # Ignore comments, headers.
        if isinstance(feature, (Header, Comment)):
            continue

        name = ""
        if gff_format and interpret_features:
            # Processing features.
            gff_util.convert_gff_coords_to_bed(feature)
            chrom = feature.chrom
            start = feature.start
            end = feature.end
            strand = feature.strand
        else:
            # Processing lines, either interval or GFF format.
            line = feature.rstrip('\r\n')
            if not line or line.startswith("#"):
                continue
            fields = line.split('\t')
            try:
                chrom = fields[chrom_col]
                start = int(fields[start_col])
                end = int(fields[end_col])
                if name_col:
                    name = fields[name_col]
                if gff_format:
                    start, end = gff_util.convert_gff_coords_to_bed(
                        [start, end])
                if includes_strand_col:
                    strand = fields[strand_col]
            except Exception:
                record_skip(feature, line_count,
                            "Invalid chrom, start or end column values. ")
                continue
            if start > end:
                record_skip(
                    feature, line_count,
                    "Invalid interval, start '%d' > end '%d'.  " % (
                        start, end))
                continue

            if strand not in ['+', '-']:
                strand = '+'
            sequence = ''

        # Open sequence file and get sequence for feature/interval.
        nib_path = "%s/%s.nib" % (seq_path, chrom)
        if seq_path and os.path.exists(nib_path):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                # nib is a binary format, hence binary mode.
                nibs[chrom] = nib = bx.seq.nib.NibFile(open(nib_path, "rb"))
            try:
                sequence = nib.get(start, end - start)
            except Exception:
                record_skip(
                    feature, line_count,
                    "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                        start, end - start, dbkey))
                continue
        elif seq_path and os.path.isfile(seq_path):
            if not twobitfile:
                # 2bit is a binary format, hence binary mode.
                twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path, "rb"))
            try:
                if options.gff and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
                    for interval in feature.intervals:
                        sequence += twobitfile[
                            interval.chrom][interval.start:interval.end]
                else:
                    sequence = twobitfile[chrom][start:end]
            except Exception:
                record_skip(
                    feature, line_count,
                    "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (
                        start, end - start, chrom))
                continue
        else:
            record_skip(
                feature, line_count,
                "Chromosome by name '%s' was not found for build '%s'. " % (
                    chrom, dbkey))
            continue
        if sequence == '':
            record_skip(
                feature, line_count,
                "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " %
                (chrom, start, end, dbkey))
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement(sequence)

        if output_format == "fasta":
            if gff_format:
                start, end = gff_util.convert_bed_coords_to_gff([start, end])
            fields = [dbkey, str(chrom), str(start), str(end), strand]
            meta_data = "_".join(fields)
            if name.strip():
                fout.write(">%s %s\n" % (meta_data, name))
            else:
                fout.write(">%s\n" % meta_data)
            # Sequence body, wrapped at 50 characters per line.
            seq_len = len(sequence)
            pos = 0
            while pos < seq_len:
                stop = min(pos + 50, seq_len)
                fout.write("%s\n" % str(sequence[pos:stop]))
                pos = stop
        else:  # output_format == "interval"
            if gff_format and interpret_features:
                # TODO: need better GFF Reader to capture all information needed
                # to produce this line.
                meta_data = "\t".join([
                    feature.chrom, "galaxy_extract_genomic_dna", "interval",
                    str(feature.start),
                    str(feature.end), feature.score, feature.strand, ".",
                    gff_util.gff_attributes_to_str(feature.attributes, "GTF")
                ])
            else:
                meta_data = "\t".join(fields)
            if gff_format:
                format_str = "%s seq \"%s\";\n"
            else:
                format_str = "%s\t%s\n"
            fout.write(format_str % (meta_data, str(sequence)))

    fout.close()
    input_file.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped['count']:
        # Error message includes up to the first 10 skipped lines.
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % (
            skipped['count'], skipped['first_line'],
            '\n'.join(skipped['lines'][:10])))

    # Clean up temp file.
    if fasta_file:
        os.remove(seq_path)
        os.remove(tmp_name)
Ejemplo n.º 8
0
def __main__():
    """Write the genomic sequence of every interval in the input file.

    Options and the two positional arguments (input file name, output file
    name) come from ``doc_optparse.parse(__doc__)``.  Sequence data is read
    from the path returned by ``check_seq_file``: either per-chromosome
    ``<chrom>.nib`` files under that path, or the path itself opened as a
    2bit file.  Each sequence is written as a FASTA record when
    ``options.output_format`` is ``"fasta"``, otherwise appended to the
    tab-separated input line.  Lines that cannot be processed are skipped,
    and a summary of warnings and skipped lines is printed at the end.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_col, start_col, end_col, strand_col = parse_cols_arg(options.cols)
        dbkey = options.dbkey
        output_format = options.output_format
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        input_filename, output_filename = args
    except Exception:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}  # chrom -> NibFile, so each nib file is opened only once
    seq_path = check_seq_file(dbkey, GALAXY_DATA_INDEX_DIR)
    if not os.path.exists(seq_path):
        # If this occurs, we need to fix the metadata validator.
        stop_err("No sequences are available for '%s', request them by reporting this error." % dbkey)

    warnings = []
    # Skip bookkeeping lives in a dict so record_skip() can update it.
    skipped = {'count': 0, 'first_line': 0, 'line': ''}

    # Record a warning and remember the first line that was skipped.
    def record_skip(line_number, text, message):
        warnings.append(message)
        skipped['count'] += 1
        if not skipped['line']:
            skipped['first_line'] = line_number
            skipped['line'] = text

    fout = open(output_filename, "w")
    twobitfile = None
    input_file = open(input_filename)

    for i, line in enumerate(input_file):
        line = line.rstrip('\r\n')
        if not line or line.startswith("#"):
            # Blank line or comment.
            continue

        fields = line.split('\t')
        try:
            chrom = fields[chrom_col]
            start = int(fields[start_col])
            end = int(fields[end_col])
            if includes_strand_col:
                strand = fields[strand_col]
        except Exception:
            record_skip(i + 1, line,
                        "Invalid chrom, start or end column values. ")
            continue
        if start > end:
            record_skip(i + 1, line,
                        "Invalid interval, start '%d' > end '%d'.  " % (start, end))
            continue

        if strand not in ['+', '-']:
            strand = '+'
        sequence = ''

        nib_path = "%s/%s.nib" % (seq_path, chrom)
        if seq_path and os.path.exists(nib_path):
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                # nib is a binary format, hence binary mode.
                nibs[chrom] = nib = bx.seq.nib.NibFile(open(nib_path, "rb"))
            try:
                sequence = nib.get(start, end - start)
            except Exception:
                record_skip(
                    i + 1, line,
                    "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, dbkey))
                continue
        elif seq_path and os.path.isfile(seq_path):
            if not twobitfile:
                # 2bit is a binary format, hence binary mode.
                twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path, "rb"))
            try:
                sequence = twobitfile[chrom][start:end]
            except Exception:
                record_skip(
                    i + 1, line,
                    "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, dbkey))
                continue
        else:
            record_skip(
                i + 1, line,
                "Chromosome by name '%s' was not found for build '%s'. " % (chrom, dbkey))
            continue
        if sequence == '':
            record_skip(
                i + 1, line,
                "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (chrom, start, end, dbkey))
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement(sequence)

        if output_format == "fasta":
            fields = [dbkey, str(chrom), str(start), str(end), strand]
            meta_data = "_".join(fields)
            fout.write(">%s\n" % meta_data)
            # Sequence body, wrapped at 50 characters per line.
            seq_len = len(sequence)
            pos = 0
            while pos < seq_len:
                stop = min(pos + 50, seq_len)
                fout.write("%s\n" % str(sequence[pos:stop]))
                pos = stop
        else:  # output_format == "interval"
            meta_data = "\t".join(fields)
            fout.write("%s\t%s\n" % (meta_data, str(sequence)))

    fout.close()
    input_file.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped['count']:
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % (
            skipped['count'], skipped['first_line'], skipped['line']))
Ejemplo n.º 9
0
def __main__():
    """Write the genomic sequence of every interval/feature in the input file.

    Options and the two positional arguments (input file name, output file
    name) come from ``doc_optparse.parse(__doc__)``.  Sequence data is taken
    from a 2bit file built with ``faToTwoBit`` when ``options.fasta`` is set,
    otherwise from the path returned by ``check_seq_file``: either
    per-chromosome ``<chrom>.nib`` files under that path, or the path itself
    opened as a 2bit file.  Each sequence is written as a FASTA record when
    ``options.output_format`` is ``"fasta"``, otherwise appended to the
    interval/GFF line.  Inputs that cannot be processed are skipped, and a
    summary of warnings and skipped lines is printed at the end.
    """
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse(__doc__)
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(
                options.cols)
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg(
                options.cols)
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except Exception:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}  # chrom -> NibFile, so each nib file is opened only once

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile(dir=".").name
            tmp_name = tempfile.NamedTemporaryFile(dir=".").name
            tmp_stderr = open(tmp_name, 'wb')
            # Argument list without a shell, so paths containing spaces or
            # shell metacharacters reach faToTwoBit unchanged.
            proc = subprocess.Popen(args=["faToTwoBit", fasta_file, seq_path],
                                    stderr=tmp_stderr.fileno())
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open(tmp_name, 'rb')
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    chunk = tmp_stderr.read(buffsize)
                    if not chunk:
                        # End of file.  Testing the chunk rather than the
                        # accumulated length also terminates when the output
                        # size is an exact multiple of buffsize.
                        break
                    stderr += chunk
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err('Error running faToTwoBit. ' + str(e))
    else:
        seq_path = check_seq_file(dbkey, GALAXY_DATA_INDEX_DIR)
        if not os.path.exists(seq_path):
            # If this occurs, we need to fix the metadata validator.
            stop_err(
                "No sequences are available for '%s', request them by reporting this error."
                % dbkey)

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines(feature):
        if isinstance(feature, gff_util.GFFFeature):
            return feature.lines()
        else:
            return [feature.rstrip('\r\n')]

    warnings = []
    # Skip bookkeeping lives in a dict so record_skip() can update it.
    skipped = {'count': 0, 'first_line': 0, 'lines': []}

    # Record a warning and account for the skipped feature's line(s).
    def record_skip(feature, line_number, message):
        warnings.append(message)
        feature_lines = get_lines(feature)
        if not skipped['lines']:
            skipped['lines'] = feature_lines
            skipped['first_line'] = line_number
        # Count this feature's own lines, not those of the first skip.
        skipped['count'] += len(feature_lines)

    fout = open(output_filename, "w")
    twobitfile = None
    input_file = open(input_filename)
    file_iterator = input_file
    if gff_format and interpret_features:
        file_iterator = gff_util.GFFReaderWrapper(file_iterator,
                                                  fix_strand=False)
    next_line = 1
    for feature in file_iterator:
        # 1-based number of this item's first line.  The counter is advanced
        # up front so that every `continue` below keeps it accurate.
        line_count = next_line
        if isinstance(feature, gff_util.GFFFeature):
            next_line += len(feature.intervals)
        else:
            next_line += 1

        # Ignore comments, headers.
        if isinstance(feature, (Header, Comment)):
            continue

        name = ""
        if gff_format and interpret_features:
            # Processing features.
            gff_util.convert_gff_coords_to_bed(feature)
            chrom = feature.chrom
            start = feature.start
            end = feature.end
            strand = feature.strand
        else:
            # Processing lines, either interval or GFF format.
            line = feature.rstrip('\r\n')
            if not line or line.startswith("#"):
                continue
            fields = line.split('\t')
            try:
                chrom = fields[chrom_col]
                start = int(fields[start_col])
                end = int(fields[end_col])
                if name_col:
                    name = fields[name_col]
                if gff_format:
                    start, end = gff_util.convert_gff_coords_to_bed(
                        [start, end])
                if includes_strand_col:
                    strand = fields[strand_col]
            except Exception:
                record_skip(feature, line_count,
                            "Invalid chrom, start or end column values. ")
                continue
            if start > end:
                record_skip(
                    feature, line_count,
                    "Invalid interval, start '%d' > end '%d'.  " % (
                        start, end))
                continue

            if strand not in ['+', '-']:
                strand = '+'
            sequence = ''

        # Open sequence file and get sequence for feature/interval.
        nib_path = "%s/%s.nib" % (seq_path, chrom)
        if seq_path and os.path.exists(nib_path):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                # nib is a binary format, hence binary mode.
                nibs[chrom] = nib = bx.seq.nib.NibFile(open(nib_path, "rb"))
            try:
                sequence = nib.get(start, end - start)
            except Exception:
                record_skip(
                    feature, line_count,
                    "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                        start, end - start, dbkey))
                continue
        elif seq_path and os.path.isfile(seq_path):
            if not twobitfile:
                # 2bit is a binary format, hence binary mode.
                twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path, "rb"))
            try:
                if options.gff and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
                    for interval in feature.intervals:
                        sequence += twobitfile[
                            interval.chrom][interval.start:interval.end]
                else:
                    sequence = twobitfile[chrom][start:end]
            except Exception:
                record_skip(
                    feature, line_count,
                    "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (
                        start, end - start, chrom))
                continue
        else:
            record_skip(
                feature, line_count,
                "Chromosome by name '%s' was not found for build '%s'. " % (
                    chrom, dbkey))
            continue
        if sequence == '':
            record_skip(
                feature, line_count,
                "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " %
                (chrom, start, end, dbkey))
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement(sequence)

        if output_format == "fasta":
            if gff_format:
                start, end = gff_util.convert_bed_coords_to_gff([start, end])
            fields = [dbkey, str(chrom), str(start), str(end), strand]
            meta_data = "_".join(fields)
            if name.strip():
                fout.write(">%s %s\n" % (meta_data, name))
            else:
                fout.write(">%s\n" % meta_data)
            # Sequence body, wrapped at 50 characters per line.
            seq_len = len(sequence)
            pos = 0
            while pos < seq_len:
                stop = min(pos + 50, seq_len)
                fout.write("%s\n" % str(sequence[pos:stop]))
                pos = stop
        else:  # output_format == "interval"
            if gff_format and interpret_features:
                # TODO: need better GFF Reader to capture all information needed
                # to produce this line.
                meta_data = "\t".join([
                    feature.chrom, "galaxy_extract_genomic_dna", "interval",
                    str(feature.start),
                    str(feature.end), feature.score, feature.strand, ".",
                    gff_util.gff_attributes_to_str(feature.attributes, "GTF")
                ])
            else:
                meta_data = "\t".join(fields)
            if gff_format:
                format_str = "%s seq \"%s\";\n"
            else:
                format_str = "%s\t%s\n"
            fout.write(format_str % (meta_data, str(sequence)))

    fout.close()
    input_file.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped['count']:
        # Error message includes up to the first 10 skipped lines.
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % (
            skipped['count'], skipped['first_line'],
            '\n'.join(skipped['lines'][:10])))

    # Clean up temp file.
    if fasta_file:
        os.remove(seq_path)
        os.remove(tmp_name)
Ejemplo n.º 10
0
                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''
            else:
                continue

        # Open sequence file and get sequence for feature/interval. 
        if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                nibs[chrom] = nib = bx.seq.nib.NibFile( file( "%s/%s.nib" % ( seq_path, chrom ) ) )
            try:
                sequence = nib.get( start, end-start )
            except Exception, e:
                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
                warnings.append( warning )
                if not invalid_lines:
                    invalid_lines = get_lines( feature )
                    first_invalid_line = line_count
                skipped_lines += len( invalid_lines )
                continue
        elif seq_path and os.path.isfile( seq_path ):
            if not(twobitfile):
                twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
            try:
                if options.gff and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
Ejemplo n.º 11
0
def _apply_flank_case(sequence, lflank, rflank):
    """Return *sequence* with its flanks lower-cased and its core upper-cased.

    The first ``lflank`` and the last ``rflank`` characters are treated as
    flanks. If the sequence is shorter than both flanks combined, the left
    flank takes precedence; no character is dropped or duplicated.
    """
    core_end = max(len(sequence) - rflank, lflank)
    return (sequence[:lflank].lower() +
            sequence[lflank:core_end].upper() +
            sequence[core_end:].lower())


def __main__():
    """Extract the sequence for every interval of a tab-separated input file.

    Column indexes, output format, sequence location and optional flank sizes
    come from the command-line options parsed from the module docstring; the
    two positional arguments are the input and output file names.

    For each non-blank, non-comment input line the interval is widened by the
    left/right flanks and its sequence is read either from
    ``<seq_path>/<chrom>.nib`` or, when ``seq_path`` is a regular file, from
    that file opened as a 2bit file. Minus-strand sequences are
    reverse-complemented when a strand column is configured. Flank bases are
    written lower-case and the original interval upper-case.

    Output is FASTA (50 bases per line) when the output format is "fasta";
    otherwise each input line is echoed with the sequence appended as an extra
    tab-separated column. Lines that cannot be processed are skipped, and a
    summary of the first problem is printed at the end.
    """
    fasta_line_width = 50
    lflank = 0
    rflank = 0

    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_col, start_col, end_col, strand_col = parse_cols_arg(
            options.cols)
        output_format = options.output_format
        seq_path = options.seq_path
        if options.left_flank:
            lflank = int(options.left_flank)
        if options.right_flank:
            rflank = int(options.right_flank)
        input_filename, output_filename = args
    except Exception:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    # The sequence location doubles as the build label used in messages and
    # FASTA headers.
    dbkey = seq_path

    if not seq_path or not os.path.exists(seq_path):
        # If this occurs, we need to fix the metadata validator.
        print("No sequences are available for '%s', request them by reporting this error." % dbkey)

    # Only the number of problems and the first occurrence are ever reported,
    # so nothing else is retained.
    skipped = {"count": 0, "warning": '', "line_no": 0, "line": ''}

    def note_skip(line_no, text, warning):
        """Count a skipped line, remembering only the first one seen."""
        if not skipped["count"]:
            skipped["warning"] = warning
            skipped["line_no"] = line_no
            skipped["line"] = text
        skipped["count"] += 1

    nibs = {}
    twobitfile = None
    seq_handles = []  # Every sequence file opened, so all can be closed.

    try:
        with open(input_filename) as fin, open(output_filename, "w") as fout:
            for line_no, line in enumerate(fin, 1):
                line = line.rstrip('\r\n')
                if not line or line.startswith("#"):
                    continue

                fields = line.split('\t')
                try:
                    chrom = fields[chrom_col]
                    ostart = int(fields[start_col])
                    oend = int(fields[end_col])
                    if includes_strand_col:
                        strand = fields[strand_col]
                except Exception:
                    note_skip(line_no, line,
                              "Invalid chrom, start or end column values. ")
                    continue

                start = ostart - lflank
                end = oend + rflank
                if start > end:
                    note_skip(line_no, line,
                              "Invalid interval, start '%d' > end '%d'.  " % (
                                  start, end))
                    continue

                if strand not in ('+', '-'):
                    strand = '+'
                sequence = ''
                fetch_warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                    start, end, dbkey)
                nib_path = "%s/%s.nib" % (seq_path, chrom)

                if seq_path and os.path.exists(nib_path):
                    if chrom in nibs:
                        nib = nibs[chrom]
                    else:
                        handle = open(nib_path, "rb")
                        seq_handles.append(handle)
                        nibs[chrom] = nib = bx.seq.nib.NibFile(handle)
                    try:
                        sequence = nib.get(start, end - start)
                    except Exception:
                        note_skip(line_no, line, fetch_warning)
                        continue
                elif seq_path and os.path.isfile(seq_path):
                    if twobitfile is None:
                        handle = open(seq_path, "rb")
                        seq_handles.append(handle)
                        twobitfile = bx.seq.twobit.TwoBitFile(handle)
                    try:
                        sequence = twobitfile[chrom][start:end]
                    except Exception:
                        note_skip(line_no, line, fetch_warning)
                        continue
                else:
                    note_skip(line_no, line,
                              "Chromosome by name '%s' was not found for build '%s'. " % (
                                  chrom, dbkey))
                    continue

                if not sequence:
                    note_skip(line_no, line,
                              "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (
                                  chrom, start, end, dbkey))
                    continue

                if includes_strand_col and strand == "-":
                    sequence = reverse_complement(sequence)
                    # Reversing puts the genomic right flank at the front of
                    # the string, so the flank lengths swap places.
                    sequence = _apply_flank_case(sequence, rflank, lflank)
                else:
                    sequence = _apply_flank_case(sequence, lflank, rflank)

                if output_format == "fasta":
                    header = "_".join(
                        [dbkey, str(chrom), str(ostart), str(oend), strand])
                    fout.write(">%s\n" % header)
                    for offset in range(0, len(sequence), fasta_line_width):
                        chunk = sequence[offset:offset + fasta_line_width]
                        fout.write("%s\n" % str(chunk))
                else:  # output_format == "interval"
                    fout.write("%s\t%s\n" % ("\t".join(fields), str(sequence)))
    finally:
        for handle in seq_handles:
            handle.close()

    if skipped["count"]:
        # Every warning corresponds to exactly one skipped line, so the two
        # totals are the same number.
        print("%d warnings, 1st is: %s" % (skipped["count"], skipped["warning"]))
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % (
            skipped["count"], skipped["line_no"], skipped["line"]))