Ejemplo n.º 1
0
def __main__():
    """Keep features whose count of a given interval type meets a condition.

    Reads four positional command-line arguments from ``sys.argv``:

    1. input file name, read through ``GFFReaderWrapper``
    2. output file name, receives the intervals of every kept feature
    3. feature name, compared against ``interval.feature``
    4. condition of the form ``<operator><number>``; escaped operators are
       first translated back using ``mapped_ops``

    A malformed condition is reported on stderr. A summary line is printed
    to stdout once filtering has finished.
    """
    # Get args.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    feature_name = sys.argv[3]
    condition = sys.argv[4]

    # Unescape operations in condition str.
    for key, value in mapped_ops.items():
        condition = condition.replace(key, value)

    # Error checking: condition should be of the form <operator><number>
    for op in ops:
        if op in condition:
            # partition() cannot fail to unpack, even when the operator
            # appears more than once; the leftover text then simply fails
            # the float() conversion below.
            empty, _, number_str = condition.partition(op)
            try:
                number = float(number_str)
            except ValueError:
                number = None
            # Compare against None explicitly: a threshold of 0 is valid
            # and must not be rejected for being falsy.
            if empty != "" or number is None:
                print("Invalid condition: %s, cannot filter." % condition,
                      file=sys.stderr)
                return
            break

    # Do filtering.
    kept_features = 0
    total_features = 0
    # NOTE(review): these two counters are never updated in this function,
    # so the "Skipped ..." suffix below is currently never emitted.
    skipped_lines = 0
    first_skipped_line = 0
    # The context managers close both files on every exit path, including
    # the sys.exit() below.
    with open(output_name, 'w') as out, open(input_name) as in_file:
        # Counting from 1 makes total_features the number of items read and
        # leaves it at 0 when the reader yields nothing.
        for total_features, feature in enumerate(GFFReaderWrapper(in_file), 1):
            if not isinstance(feature, GenomicInterval):
                continue
            count = 0
            for interval in feature.intervals:
                if interval.feature == feature_name:
                    count += 1
            eval_text = '%s %s' % (count, condition)
            if not check_expression(eval_text):
                print("Invalid condition: %s, cannot filter." % condition,
                      file=sys.stderr)
                sys.exit(1)

            # NOTE(security): eval() runs text built from a command-line
            # argument; check_expression() above is the only safeguard.
            if eval(eval_text):
                # Keep feature.
                for interval in feature.intervals:
                    out.write("\t".join(interval.fields) + '\n')
                kept_features += 1

    # Avoid dividing by zero when the input held no features at all.
    if total_features:
        percent_kept = float(kept_features) / total_features * 100.0
    else:
        percent_kept = 0.0

    info_msg = "%i of %i features kept (%.2f%%) using condition %s.  " % \
        (kept_features, total_features, percent_kept, feature_name + condition)
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (
            skipped_lines, first_skipped_line)
    print(info_msg)
Ejemplo n.º 2
0
def main():
    """Index the intervals of a GFF file and write the index to disk.

    Expects exactly two command-line arguments: the input file name and the
    output file name. Every ``GenomicInterval`` produced by
    ``GFFReaderWrapper`` is converted with ``convert_gff_coords_to_bed`` and
    added to an ``Indexes`` object together with the running offset, which
    is the sum of ``raw_size`` over all items read before it.
    """
    # Arguments
    input_fname, out_fname = sys.argv[1:]

    # Do conversion.
    index = Indexes()
    offset = 0
    input_stream = fileinput.FileInput(input_fname)
    try:
        # Iterate the reader directly; the loop only needs one item at a
        # time, so there is no need to hold the whole input in a list.
        for feature in GFFReaderWrapper(input_stream, fix_strand=True):
            # Add feature; index expects BED coordinates.
            if isinstance(feature, GenomicInterval):
                convert_gff_coords_to_bed(feature)
                index.add(feature.chrom, feature.start, feature.end, offset)

            # Always increment offset, even if feature is not an interval
            # and hence not included in the index.
            offset += feature.raw_size
    finally:
        input_stream.close()

    # Close the output deterministically so the index is flushed to disk.
    with open(out_fname, "wb") as out:
        index.write(out)
Ejemplo n.º 3
0
def main():
    """Write a sorted, fixed-width table mapping names to locations.

    Command line: ``-F/--format <gff|gtf|bed> <input file> <output file>``.

    For GFF/GTF input, every non-numeric attribute value of a feature
    becomes a name; repeated names are merged by widening the stored
    start/end range. For BED input, the fourth column is the name and a
    later line with the same name replaces the earlier one. Any other
    format yields an empty table.

    The output begins with a header line holding the padded line length
    (longest entry plus one), followed by one line per name of the form
    ``<lowercased name>\\t<name>\\t<contig>:<start>-<end>``, each padded
    with spaces to the longest entry.
    """
    # Process arguments.
    parser = optparse.OptionParser()
    parser.add_option('-F', '--format', dest="input_format")
    (options, args) = parser.parse_args()
    # Report usage problems through the parser instead of crashing with an
    # AttributeError / ValueError traceback.
    if options.input_format is None:
        parser.error("an input format must be given with -F/--format")
    if len(args) != 2:
        parser.error("expected an input file name and an output file name")
    in_fname, out_fname = args
    input_format = options.input_format.lower()

    # Create dict of name-location pairings.
    name_loc_dict = {}
    if input_format in ['gff', 'gtf']:
        # GTF/GFF format
        with open(in_fname, 'r') as in_file:
            # Create reader.
            if input_format == 'gff':
                in_reader = GFFReaderWrapper(in_file)
            else:  # input_format == 'gtf'
                in_reader = read_unordered_gtf(in_file)

            for feature in in_reader:
                if isinstance(feature, Comment):
                    continue

                # Convert coordinates at most once per feature. The
                # conversion used to run for every non-numeric attribute;
                # presumably it shifts coordinates in place, so repeating
                # it would corrupt start/end -- TODO confirm.
                converted = False
                for name in feature.attributes:
                    val = feature.attributes[name]
                    try:
                        float(val)
                    except (TypeError, ValueError):
                        is_number = False
                    else:
                        is_number = True
                    if is_number:
                        # Numeric values are not indexed.
                        continue

                    if not converted:
                        convert_gff_coords_to_bed(feature)
                        converted = True

                    # Value is not a number, so it can be indexed.
                    if val not in name_loc_dict:
                        # Value is not in dictionary.
                        name_loc_dict[val] = {
                            'contig': feature.chrom,
                            'start': feature.start,
                            'end': feature.end
                        }
                    else:
                        # Value already in dictionary, so widen its range.
                        loc = name_loc_dict[val]
                        if feature.start < loc['start']:
                            loc['start'] = feature.start
                        if feature.end > loc['end']:
                            loc['end'] = feature.end
    elif input_format == 'bed':
        # BED format.
        with open(in_fname, 'r') as in_file:
            for line in in_file:
                # Ignore track lines.
                if line.startswith("track"):
                    continue

                fields = line.split()

                # Ignore lines with no feature name.
                if len(fields) < 4:
                    continue

                # Process line
                name_loc_dict[fields[3]] = {
                    'contig': fields[0],
                    'start': int(fields[1]),
                    'end': int(fields[2])
                }

    # Create sorted list of entries. Sorting the dict itself iterates its
    # keys on both Python 2 and 3 (iterkeys() exists only on Python 2).
    max_len = 0
    entries = []
    for name in sorted(name_loc_dict):
        loc = name_loc_dict[name]
        entry = '%s\t%s\t%s' % (name.lower(), name, '%s:%i-%i' %
                                (loc['contig'], loc['start'], loc['end']))
        if len(entry) > max_len:
            max_len = len(entry)
        entries.append(entry)

    # Write padded entries; the header holds the line length including the
    # trailing newline.
    with open(out_fname, 'w') as out:
        out.write(str(max_len + 1).ljust(max_len) + '\n')
        for entry in entries:
            out.write(entry.ljust(max_len) + '\n')
Ejemplo n.º 4
0
#!/usr/bin/env python
# This tool takes a gff file as input and counts the number of features in it.

import sys, fileinput
from galaxy import eggs
from galaxy.datatypes.util.gff_util import GFFReaderWrapper
from bx.intervals.io import GenomicInterval

# Get args: every command-line argument is passed on as an input file name.
input_file = sys.argv[1:]

# Count features: only items that are GenomicInterval instances are counted;
# anything else the reader yields is ignored.
count = 0
for feature in GFFReaderWrapper(fileinput.FileInput(input_file),
                                fix_strand=True):
    if isinstance(feature, GenomicInterval):
        count += 1

# Call form with a single argument prints the same text under Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(count)