Example #1
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov",
                 default=False,
                 action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap",
                 action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option(
        "--iterby",
        dest="iterby",
        default="query",
        choices=allowed_iterby,
        help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches +
                                    this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format(
                            "\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                     this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
Example #2
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid", dest="pctid", default=90, type="int",
            help="Percentage identity cutoff [default: %default]")
    p.add_option("--pctcov", dest="pctcov", default=50, type="int",
            help="Percentage identity cutoff [default: %default]")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            this_covered += abs(b.qstart - b.qstop + 1)
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    print >> sys.stderr, "Identity: {0} mismatches, {1} gaps, {2} alignlen".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    print >> sys.stderr, "Total mapped: {0} ({1:.1f}% of {2})".\
            format(mapped_count, mapped_count * 100. / total, total)
    print >> sys.stderr, "Total valid {0}: {1} ({2:.1f}% of {3})".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    print >> sys.stderr, "Average id = {0:.2f}%".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sum(sizes[x] for x in queries)
    print >> sys.stderr, "Coverage: {0} covered, {1} total".\
            format(covered, queries_combined)
    print >> sys.stderr, "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fp = open(blastfile)
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            print >> fw, b
Example #3
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
            help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
            help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
            help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
            help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query", choices=allowed_iterby,
            help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for id, size in sz.iter_sizes():
                if id not in allpairs:
                    print "\t".join((id, "na", "0", "0"))
                else:
                    for qs in allpairs[id]:
                        this_identity, this_coverage = covidstore[qs]
                        print "{0}\t{1:.1f}\t{2:.1f}".format("\t".join(qs), this_identity, this_coverage)
        else:
            for query, size in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage)

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
            format(mapped_count, mapped_count * 100. / total, total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    m += "Average id = {0:.2f}%\n".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
            format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    logfile = blastfile + ".covfilter.log"
    fw = open(logfile, "w")
    for f in (sys.stderr, fw):
        print >> f, m
    fw.close()

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print >> fw, b
Example #4
0
File: qc.py Project: bennyyu/jcvi
def rnaseq(args):
    """
    %prog rnaseq blastfile ref.fasta

    Evaluate de-novo RNA-seq assembly against a reference gene set (same or
    closely related organism). Ideally blatfile needs to be supermap'd.

    Following metric is used (Martin et al. 2010, Rnnotator paper):
    Accuracy: % of contigs share >=95% identity with ref genome (TODO)
    Completeness: % of ref genes covered by contigs to >=80% of their lengths
    Contiguity: % of ref genes covered by a *single* contig >=80% of lengths
    Chimer: % of contigs that contain two or more annotated genes >= 50bp
    """
    from jcvi.algorithms.supermap import supermap

    p = OptionParser(rnaseq.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    blastfile, reffasta = args
    sizes = Sizes(reffasta).mapping
    known_genes = len(sizes)

    querysupermap = blastfile + ".query.supermap"
    refsupermap = blastfile + ".ref.supermap"

    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")
    if not op.exists(refsupermap):
        supermap(blastfile, filter="ref")

    blast = Blast(querysupermap)
    chimers = 0
    goodctg80 = set()
    goodctg50 = set()
    for ctg, hits in blast.iter_hits():
        bps = defaultdict(int)
        for x in hits:
            bps[x.subject] += abs(x.sstop - x.sstart) + 1

        valid_hits = bps.items()
        for vh, length in valid_hits:
            rsize = sizes[vh]
            ratio = length * 100. / rsize
            if ratio >= 80:
                goodctg80.add(ctg)
            if ratio >= 50:
                goodctg50.add(ctg)

        # Chimer
        if len(valid_hits) > 1:
            chimers += 1

    blast = Blast(refsupermap)
    goodref80 = set()
    goodref50 = set()
    bps = defaultdict(int)
    for x in blast.iter_line():
        bps[x.subject] += abs(x.sstop - x.sstart) + 1

    for vh, length in bps.items():
        rsize = sizes[vh]
        ratio = length * 100. / rsize
        if ratio >= 80:
            goodref80.add(vh)
        if ratio >= 50:
            goodref50.add(vh)

    print >> sys.stderr, "Reference set: `{0}`,  # of transcripts {1}".\
            format(reffasta, known_genes)
    print >> sys.stderr, "A total of {0} contigs map to 80% of a reference"\
            " transcript".format(len(goodctg80))
    print >> sys.stderr, "A total of {0} contigs map to 50% of a reference"\
            " transcript".format(len(goodctg50))
    print >> sys.stderr, "A total of {0} reference transcripts ({1:.1f}%) have 80% covered" \
            .format(len(goodref80), len(goodref80) * 100. / known_genes)
Example #5
0
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid",
                 dest="pctid",
                 default=90,
                 type="int",
                 help="Percentage identity cutoff [default: %default]")
    p.add_option("--pctcov",
                 dest="pctcov",
                 default=50,
                 type="int",
                 help="Percentage identity cutoff [default: %default]")
    p.add_option("--ids",
                 dest="ids",
                 default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list",
                 dest="list",
                 default=False,
                 action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            this_covered += abs(b.qstart - b.qstop + 1)
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches +
                                this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            print "{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity,
                                                 this_coverage)

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    print >> sys.stderr, "Identity: {0} mismatches, {1} gaps, {2} alignlen".\
            format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    print >> sys.stderr, "Total mapped: {0} ({1:.1f}% of {2})".\
            format(mapped_count, mapped_count * 100. / total, total)
    print >> sys.stderr, "Total valid {0}: {1} ({2:.1f}% of {3})".\
            format(cutoff_message, valid_count, valid_count * 100. / total, total)
    print >> sys.stderr, "Average id = {0:.2f}%".\
            format(100 - (mismatches + gaps) * 100. / alignlen)

    queries_combined = sum(sizes[x] for x in queries)
    print >> sys.stderr, "Coverage: {0} covered, {1} total".\
            format(covered, queries_combined)
    print >> sys.stderr, "Average coverage = {0:.2f}%".\
            format(covered * 100. / queries_combined)

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for id in valid:
            print >> fw, id
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".\
                format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fp = open(blastfile)
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            print >> fw, b
Example #6
0
File: qc.py Project: linlifeng/jcvi
def rnaseq(args):
    """
    %prog rnaseq blastfile ref.fasta

    Evaluate de-novo RNA-seq assembly against a reference gene set (same or
    closely related organism). Ideally blatfile needs to be supermap'd.

    Following metric is used (Martin et al. 2010, Rnnotator paper):
    Accuracy: % of contigs share >=95% identity with ref genome (TODO)
    Completeness: % of ref genes covered by contigs to >=80% of their lengths
    Contiguity: % of ref genes covered by a *single* contig >=80% of lengths
    Chimer: % of contigs that contain two or more annotated genes >= 50bp
    """
    from jcvi.algorithms.supermap import supermap

    p = OptionParser(rnaseq.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    blastfile, reffasta = args
    sizes = Sizes(reffasta).mapping
    known_genes = len(sizes)

    querysupermap = blastfile + ".query.supermap"
    refsupermap = blastfile + ".ref.supermap"

    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")
    if not op.exists(refsupermap):
        supermap(blastfile, filter="ref")

    blast = Blast(querysupermap)
    chimers = 0
    goodctg80 = set()
    goodctg50 = set()
    for ctg, hits in blast.iter_hits():
        bps = defaultdict(int)
        for x in hits:
            bps[x.subject] += abs(x.sstop - x.sstart) + 1

        valid_hits = bps.items()
        for vh, length in valid_hits:
            rsize = sizes[vh]
            ratio = length * 100. / rsize
            if ratio >= 80:
                goodctg80.add(ctg)
            if ratio >= 50:
                goodctg50.add(ctg)

        # Chimer
        if len(valid_hits) > 1:
            chimers += 1

    blast = Blast(refsupermap)
    goodref80 = set()
    goodref50 = set()
    bps = defaultdict(int)
    for x in blast.iter_line():
        bps[x.subject] += abs(x.sstop - x.sstart) + 1

    for vh, length in bps.items():
        rsize = sizes[vh]
        ratio = length * 100. / rsize
        if ratio >= 80:
            goodref80.add(vh)
        if ratio >= 50:
            goodref50.add(vh)

    print >> sys.stderr, "Reference set: `{0}`,  # of transcripts {1}".\
            format(reffasta, known_genes)
    print >> sys.stderr, "A total of {0} contigs map to 80% of a reference"\
            " transcript".format(len(goodctg80))
    print >> sys.stderr, "A total of {0} contigs map to 50% of a reference"\
            " transcript".format(len(goodctg50))
    print >> sys.stderr, "A total of {0} reference transcripts ({1:.1f}%) have 80% covered" \
            .format(len(goodref80), len(goodref80) * 100. / known_genes)