コード例 #1
0
ファイル: diff_gtf.py プロジェクト: harmeet1990/cgat-apps
    def buildIndex(self, filename):
        """read and index."""

        idx = {}
        infile = iotools.open_file(filename, "r")
        for e in GTF.readFromFile(infile):
            if e.contig not in idx:
                idx[e.contig] = NCL.NCLSimple()
            idx[e.contig].add(e.start, e.end)
        infile.close()
        return idx
コード例 #2
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--output-equivalent",
                      dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f",
                      "--output-full",
                      dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    idx, genes2 = {}, set()
    for e in GTF.readFromFile(iotools.open_file(input_filename2, "r")):
        genes2.add(e.gene_id)
        if e.contig not in idx:
            idx[e.contig] = quicksect.IntervalTree()
        idx[e.contig].add(e.start, e.end, e)

    overlaps_genes = []

    E.info("reading data finished: %i contigs" % len(idx))

    # outfile_diff and outfile_overlap not implemented
    # outfile_diff = getFile( options, "diff" )
    # outfile_overlap = getFile( options, "overlap" )
    overlapping_genes = set()

    genes1 = set()

    # iterate over exons
    with iotools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):

            genes1.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(
                    quicksect.Interval(this.start, this.end))
            except KeyError:
                continue

            others = [x.data for x in intervals]
            for other in others:
                overlapping_genes.add((this.gene_id, other.gene_id))

            # check for identical/half-identical matches
            output = None
            for other in others:
                if this.start == other.start and this.end == other.end:
                    output, symbol = other, "="
                    break
            else:
                for other in others:
                    if this.start == other.start or this.end == other.end:
                        output, symbol = other, "|"
                        break
                else:
                    symbol = "~"

    # if outfile_diff != options.stdout: outfile_diff.close()
    # if outfile_overlap != options.stdout: outfile_overlap.close()

    outfile = None
    ##################################################################
    ##################################################################
    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (a, b))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        outfile = getFile(options, "genes_uniq1")
        b = set([x[0] for x in overlapping_genes])
        d = genes1.difference(b)
        outfile.write("gene_id1\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename1), len(genes1), len(b),
             100.0 * len(b) / len(a), len(d), 100.0 * len(d) / len(genes1)))

        outfile = getFile(options, "genes_uniq2")
        b = set([x[1] for x in overlapping_genes])
        d = genes2.difference(b)
        outfile.write("gene_id2\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()

        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename2), len(genes2), len(b),
             100.0 * len(b) / len(a), len(d), 100.0 * len(d) / len(genes2)))
        if outfile_total != options.stdout:
            outfile_total.close()

    E.stop()
コード例 #3
0
ファイル: gff2coverage.py プロジェクト: alphaneer/cgat-apps
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      action="append",
                      help="features to collect "
                      "[default=%default]")

    parser.add_option("-w",
                      "--window-size",
                      dest="window_size",
                      type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size.  "
                      "[default=%default]")

    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=(
                          "genomic",
                          "histogram",
                      ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (options, args) = E.start(parser, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        gff.sort(key=lambda x: (x.contig, x.start))

        chunk = []
        last_contig = None

        for entry in gff:

            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []

            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(options.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source,
                   entry.feature)] += entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            options.stdout.write("\n")

        total_genome_size = sum(
            fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) / fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()
コード例 #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w",
                        "--windows-bed-file",
                        dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d",
                        "--filename-data",
                        dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f",
                        "--features",
                        dest="features",
                        type=str,
                        action="append",
                        choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c",
                        "--decorator",
                        dest="decorator",
                        type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score", "stddev-score",
                                 "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e",
                        "--skip-empty",
                        dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    parser.add_argument(
        "-t",
        "--transform=",
        dest="transform",
        type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (args) = E.start(parser)

    #    test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(iotools.open_file(args.filename_windows, "r")))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        if args.is_gtf:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                IOTOols.open_file(args.filename_data, "r"))

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            map_contig2size[contig] = max(lambda x: x[1], values)
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]], fasta,
                args)
        else:
            annotateWindows(contig, windows[contig], [], fasta, args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()