Example 1
def synfind(args):
    """
    %prog synfind all.last *.bed

    Prepare input for SynFind.
    """
    p = OptionParser(synfind.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    fp = open(lastfile)
    filteredlast = lastfile + ".filtered"
    fw = open(filteredlast, "w")
    for row in fp:
        b = BlastLine(row)
        if b.query == b.subject:
            continue
        print(b, file=fw)
    fw.close()
    logging.debug("Filtered LAST file written to `{0}`".format(filteredlast))

    allbed = "all.bed"
    fw = open(allbed, "w")
    for i, bedfile in enumerate(bedfiles):
        prefix = chr(ord('A') + i)
        bed = Bed(bedfile)
        for b in bed:
            b.seqid = prefix + b.seqid
            print >> fw, b
    fw.close()
    logging.debug("Bed file written to `{0}`".format(allbed))
Example 2
def synfind(args):
    """
    %prog synfind all.last *.bed

    Prepare input for SynFind.
    """
    p = OptionParser(synfind.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    fp = open(lastfile)
    filteredlast = lastfile + ".filtered"
    fw = open(filteredlast, "w")
    for row in fp:
        b = BlastLine(row)
        if b.query == b.subject:
            continue
        print(b, file=fw)
    fw.close()
    logging.debug("Filtered LAST file written to `{0}`".format(filteredlast))

    allbed = "all.bed"
    fw = open(allbed, "w")
    for i, bedfile in enumerate(bedfiles):
        prefix = chr(ord('A') + i)
        bed = Bed(bedfile)
        for b in bed:
            b.seqid = prefix + b.seqid
            print(b, file=fw)
    fw.close()
    logging.debug("Bed file written to `{0}`".format(allbed))
Example 3
def read_blast(blast_file, qorder, sorder, is_self=False):
    """
    Read the BLAST file and convert gene names into positional coordinates.
    """
    fp = open(blast_file)
    filtered_blast = []
    seen = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query not in qorder or subject not in sorder:
            continue

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si

        filtered_blast.append(b)

    return filtered_blast
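
The qorder and sorder arguments are assumed to be the order mapping of a Bed object, i.e. gene name -> (position index, BedLine). A hypothetical call (file names are placeholders):

# Hypothetical usage: both order dicts come from Bed(...).order
qbed, sbed = Bed("grape.bed"), Bed("peach.bed")
hits = read_blast("grape.peach.last", qbed.order, sbed.order)
for b in hits[:3]:
    print(b.query, b.qi, b.qseqid, "->", b.subject, b.si, b.sseqid)
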
Example 4
def read_blast(blast_file, qorder, sorder, is_self=False):
    """
    Read the BLAST file and convert gene names into positional coordinates.
    """
    fp = open(blast_file)
    filtered_blast = []
    seen = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query not in qorder or subject not in sorder:
            continue

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si

        filtered_blast.append(b)

    return filtered_blast
Example 5
def dedup(args):
    """
    %prog dedup assembly.assembly.blast assembly.fasta

    Remove duplicate contigs within assembly.
    """
    from jcvi.formats.blast import BlastLine

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=0, pctcov=98)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    cov = opts.pctcov / 100.0
    sizes = Sizes(fastafile).mapping
    fp = open(blastfile)
    removed = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        qsize, ssize = sizes[query], sizes[subject]
        qspan = abs(b.qstop - b.qstart)
        if qspan < qsize * cov:
            continue
        if (qsize, query) < (ssize, subject):
            removed.add(query)

    print("\n".join(sorted(removed)))
Example 6
def make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames):
    """
    Build three matrices: observed, expected, and logmp. Only logmp is returned;
    it holds the statistical significance (-log Poisson probability of the
    observed count) for each cell.
    """
    m, n = len(qpadnames), len(spadnames)
    qpadorder, spadorder = qpadbed.order, spadbed.order
    qpadid = dict((a, i) for i, a in enumerate(qpadnames))
    spadid = dict((a, i) for i, a in enumerate(spadnames))
    qpadlen = dict((a, len(b)) for a, b in qpadbed.sub_beds())
    spadlen = dict((a, len(b)) for a, b in spadbed.sub_beds())

    qsize, ssize = len(qpadbed), len(spadbed)

    assert sum(qpadlen.values()) == qsize
    assert sum(spadlen.values()) == ssize

    # Populate arrays of observed counts and expected counts
    logging.debug("Initialize array of size ({0} x {1})".format(m, n))
    observed = np.zeros((m, n))
    fp = open(blastfile)
    all_dots = 0
    for row in fp:
        b = BlastLine(row)
        qi, q = qpadorder[b.query]
        si, s = spadorder[b.subject]
        qseqid, sseqid = q.seqid, s.seqid
        qsi, ssi = qpadid[qseqid], spadid[sseqid]
        observed[qsi, ssi] += 1
        all_dots += 1

    assert int(round(observed.sum())) == all_dots

    logging.debug("Total area: {0} x {1}".format(qsize, ssize))
    S = qsize * ssize
    expected = np.zeros((m, n))
    qsum = 0
    for i, a in enumerate(qpadnames):
        alen = qpadlen[a]
        qsum += alen
        for j, b in enumerate(spadnames):
            blen = spadlen[b]
            expected[i, j] = all_dots * alen * blen * 1.0 / S

    assert int(round(expected.sum())) == all_dots

    # Calculate the statistical significance for each cell
    from scipy.stats.distributions import poisson

    M = m * n  # multiple testing
    logmp = np.zeros((m, n))
    for i in range(m):
        for j in range(n):
            obs, exp = observed[i, j], expected[i, j]
            pois = max(poisson.pmf(obs, exp), 1e-250)  # Underflow
            logmp[i, j] = max(-log(pois), 0)

    return logmp
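
Each cell of logmp is the negative log of the Poisson probability of the observed dot count given the expected count, floored to guard against underflow. A standalone toy check of that per-cell score (the counts are made up):

from math import log

from scipy.stats.distributions import poisson

obs, exp = 12, 3.5                         # made-up observed/expected counts
pois = max(poisson.pmf(obs, exp), 1e-250)  # floor to avoid log(0)
print(max(-log(pois), 0))                  # larger = more enriched than expected
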
Example 7
def cyntenator(args):
    """
    %prog cyntenator athaliana.athaliana.last athaliana.bed

    Prepare input for Cyntenator.
    """
    p = OptionParser(cyntenator.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    fp = open(lastfile)
    filteredlastfile = lastfile + ".blast"
    fw = open(filteredlastfile, "w")
    for row in fp:
        b = BlastLine(row)
        if b.query == b.subject:
            continue
        print("\t".join((b.query, b.subject, str(b.score))), file=fw)
    fw.close()

    bedfiles = args[1:]
    fp = open(lastfile)
    b = BlastLine(next(fp))
    subject = b.subject
    txtfiles = []
    for bedfile in bedfiles:
        order = Bed(bedfile).order
        if subject in order:
            db = op.basename(bedfile).split(".")[0][:20]
            logging.debug("Found db: {0}".format(db))
        txtfile = write_txt(bedfile)
        txtfiles.append(txtfile)

    db += ".txt"
    mm = MakeManager()
    for txtfile in txtfiles:
        outfile = txtfile + ".alignment"
        cmd = 'cyntenator -t "({0} {1})" -h blast {2} > {3}'\
                .format(txtfile, db, filteredlastfile, outfile)
        mm.add((txtfile, db, filteredlastfile), outfile, cmd)
    mm.write()
Example 8
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    Read the BLAST file and convert gene names into positional coordinates.
    """
    fp = open(blast_file)
    filtered_blast = []
    seen = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder or subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si
        b.query, b.subject = query, subject

        filtered_blast.append(b)

    logging.debug("A total of {0} BLAST imported from `{1}`.".\
                  format(len(filtered_blast), blast_file))

    return filtered_blast
Example 9
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    Read the BLAST file and convert gene names into positional coordinates.
    """
    fp = open(blast_file)
    filtered_blast = []
    seen = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder or subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si
        b.query, b.subject = query, subject

        filtered_blast.append(b)

    logging.debug("A total of {0} BLAST imported from `{1}`.".\
                  format(len(filtered_blast), blast_file))

    return filtered_blast
Example 10
def bes(args):
    """
    %prog bes bacfasta clonename

    Use the clone name to download BES GSS sequences from GenBank, map them, and
    then visualize the placements.
    """
    from jcvi.apps.align import run_blat

    p = OptionParser(bes.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bacfasta, clonename = args

    entrez([clonename, "--database=nucgss", "--skipcheck"])
    besfasta = clonename + ".fasta"
    blatfile = clonename + ".bes.blat"
    run_blat(
        infile=besfasta,
        outfile=blatfile,
        db=bacfasta,
        pctid=95,
        hitlen=100,
        cpus=opts.cpus,
    )

    aid, asize = next(Fasta(bacfasta).itersizes())

    width = 50
    msg = "=" * width
    msg += "  " + aid
    print(msg, file=sys.stderr)

    ratio = width * 1.0 / asize
    _ = lambda x: int(round(x * ratio, 0))
    blasts = [BlastLine(x) for x in open(blatfile)]
    for b in blasts:
        if b.orientation == "+":
            msg = " " * _(b.sstart) + "->"
        else:
            msg = " " * (_(b.sstop) - 2) + "<-"
        msg += " " * (width - len(msg) + 2)
        msg += b.query
        if b.orientation == "+":
            msg += " (hang={0})".format(b.sstart - 1)
        else:
            msg += " (hang={0})".format(asize - b.sstop)

        print(msg, file=sys.stderr)
Example 11
def iadhore(args):
    """
    %prog iadhore athaliana.athaliana.last athaliana.bed

    Wrap around iADHoRe.
    """
    p = OptionParser(iadhore.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    blast_table = "blast_table.txt"
    fp = open(lastfile)
    seen = set()
    for row in fp:
        c = BlastLine(row)
        a, b = c.query, c.subject
        a, b = gene_name(a), gene_name(b)
        if a > b:
            a, b = b, a
        seen.add((a, b))

    fw = open(blast_table, "w")
    for a, b in seen:
        print("\t".join((a, b)), file=fw)
    fw.close()
    logging.debug("A total of {0} pairs written to `{1}`"\
            .format(len(seen), blast_table))

    fw = open("config.txt", "w")
    for bedfile in bedfiles:
        pf, stanza = write_lst(bedfile)
        print("genome={0}".format(pf), file=fw)
        for seqid, fname in stanza:
            print(" ".join((seqid, fname)), file=fw)
        print(file=fw)

    print("blast_table={0}".format(blast_table), file=fw)
    print("cluster_type=colinear", file=fw)
    print("tandem_gap=10", file=fw)
    print("prob_cutoff=0.001", file=fw)
    print("gap_size=20", file=fw)
    print("cluster_gap=20", file=fw)
    print("q_value=0.9", file=fw)
    print("anchor_points=4", file=fw)
    print("alignment_method=gg2", file=fw)
    print("max_gaps_in_alignment=20", file=fw)
    print("output_path=i-adhore_out", file=fw)
    print("number_of_threads=4", file=fw)
    fw.close()
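
The configuration written above contains one genome= stanza per input BED file followed by the global i-ADHoRe settings. A hypothetical excerpt of the resulting config.txt for a single genome (the per-chromosome list-file names depend on what write_lst produces):

genome=athaliana
chr1 athaliana.chr1.lst
chr2 athaliana.chr2.lst

blast_table=blast_table.txt
cluster_type=colinear
tandem_gap=10
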
Example 12
def blast(args):
    """
    %prog blast allfasta clonename

    Insert a component into the AGP file by aligning it to the best hits in the
    pool and checking whether they overlap well.
    """
    from jcvi.apps.command import run_megablast

    p = OptionParser(blast.__doc__)
    p.add_option("-n",
                 type="int",
                 default=2,
                 help="Take best N hits [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    allfasta, clonename = args
    fastadir = "fasta"
    infile = op.join(fastadir, clonename + ".fasta")
    if not op.exists(infile):
        fetch([clonename, "--skipcheck", "--outdir=" + fastadir])

    outfile = "{0}.{1}.blast".format(clonename, allfasta.split(".")[0])
    run_megablast(infile=infile, outfile=outfile, db=allfasta, \
            pctid=GoodPct, hitlen=GoodOverlap)

    blasts = [BlastLine(x) for x in open(outfile)]
    besthits = []
    for b in blasts:
        if b.query.count("|") >= 3:
            b.query = b.query.split("|")[3]

        if b.subject.count("|") >= 3:
            b.subject = b.subject.split("|")[3]

        b.query = b.query.rsplit(".", 1)[0]
        b.subject = b.subject.rsplit(".", 1)[0]

        if b.query == b.subject:
            continue

        if b.subject not in besthits:
            besthits.append(b.subject)
        if len(besthits) == opts.n:
            break

    for b in besthits:
        overlap([clonename, b, "--dir=" + fastadir])
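
The split("|")[3] / rsplit(".", 1)[0] normalization above assumes GenBank-style pipe-delimited FASTA identifiers; a small illustration with a hypothetical ID:

# Hypothetical GenBank-style identifier: field 3 holds the versioned accession
q = "gi|12345|gb|AC012345.6|"
acc = q.split("|")[3].rsplit(".", 1)[0]
print(acc)  # -> AC012345
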
Example 13
def blast(args):
    """
    %prog blast fastafile 

    Run BLASTN against a database (default: UniVec_Core) and output the
    vector/contaminant ranges in .bed format.
    """
    p = OptionParser(blast.__doc__)
    p.add_option("--dist",
                 dest="dist",
                 default=100,
                 type="int",
                 help="Merge adjacent HSPs separated by [default: %default]")
    p.add_option("--db",
                 dest="db",
                 default=None,
                 help="Use a different database rather than UniVec_Core")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    fastaprefix = fastafile.split(".", 1)[0]

    univec = opts.db or download(
        "ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core")
    uniprefix = univec.split(".", 1)[0]

    fastablast = fastaprefix + ".{0}.blast".format(uniprefix)

    prog = run_megablast if opts.db else run_vecscreen
    prog(infile=fastafile, outfile=fastablast, db=univec, pctid=95, hitlen=50)

    fp = open(fastablast)
    ranges = []
    for row in fp:
        b = BlastLine(row)
        ranges.append((b.query, b.qstart, b.qstop))

    merged_ranges = range_merge(ranges, dist=opts.dist)
    bedfile = fastaprefix + ".{0}.bed".format(uniprefix)
    fw = must_open(bedfile, "w")
    for seqid, start, end in merged_ranges:
        # BED uses 0-based, half-open coordinates, hence start - 1
        print("\t".join(str(x) for x in (seqid, start - 1, end, uniprefix)), file=fw)
    fw.close()

    return bedfile
Example 14
def BlastOrCoordsLine(filename, filter="ref", dialect="blast", clip=0):
    allowed_filters = ("ref", "query")
    REF, QUERY = range(len(allowed_filters))

    allowed_dialects = ("blast", "coords")
    BLAST, COORDS = range(len(allowed_dialects))

    assert filter in allowed_filters
    filter = allowed_filters.index(filter)

    assert dialect in allowed_dialects
    dialect = allowed_dialects.index(dialect)

    fp = open(filename)
    for i, row in enumerate(fp):
        if row[0] == '#':
            continue
        if dialect == BLAST:
            b = BlastLine(row)
            if filter == QUERY:
                query, start, end = b.query, b.qstart, b.qstop
            else:
                query, start, end = b.subject, b.sstart, b.sstop
        else:
            try:
                b = CoordsLine(row)
            except AssertionError:
                continue

            if filter == QUERY:
                query, start, end = b.query, b.start2, b.end2
            else:
                query, start, end = b.ref, b.start1, b.end1

        if start > end:
            start, end = end, start

        if clip:
            # clip cannot be more than 5% of the range
            r = end - start + 1
            cc = min(.05 * r, clip)
            start = start + cc
            end = end - cc

        yield Range(query, start, end, b.score, i)
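
BlastOrCoordsLine is a generator of Range tuples, so it can feed the range utilities directly. A hypothetical use (the file name is a placeholder):

# Hypothetical usage: collect reference-side ranges from a BLAST file
ranges = list(BlastOrCoordsLine("contigs.refs.blast", filter="ref", dialect="blast"))
for r in ranges[:3]:
    print(r)
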
Example 15
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))
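
The family construction relies on Grouper behaving as a disjoint-set: joining pairs merges their groups transitively, and iterating yields the merged groups. A minimal sketch of the assumed behaviour:

# Assumed Grouper (disjoint-set) behaviour: transitive joins merge groups
g = Grouper()
g.join("geneA", "geneB")
g.join("geneB", "geneC")
g.join("geneX", "geneY")
print(sorted(len(group) for group in g))  # -> [2, 3]
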
Example 16
def bed(args):
    """
    %prog bed btabfile

    Convert btab to bed format.
    """
    from jcvi.formats.blast import BlastLine
    p = OptionParser(bed.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    btabfile, = args
    btab = Btab(btabfile)
    for b in btab:
        Bline = BlastLine(b.blastline)
        print(Bline.bedline)
Example 17
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two FASTA records. The arguments can be GenBank IDs
    instead of FASTA files; in that case, the sequences will be downloaded
    first.
    """
    from jcvi.apps.command import BLPATH
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir",
                 default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--qreverse",
                 default=False,
                 action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain",
                 default=False,
                 action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, afasta + ".fasta")
        if not op.exists(af):  # Check to avoid redownload
            fetch([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, bfasta + ".fasta")
        if not op.exists(bf):
            fetch([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = BLPATH("blastn")
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue 0.01 -outfmt 6 -perc_identity {0}".format(GoodPct)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= GoodOverlap]
    dist = 2 * GoodOverlap  # Distance to chain the HSPs
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    besthsp = hsps[0]

    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize)
    o.print_graphic(qreverse=opts.qreverse)
    print(str(o), file=sys.stderr)

    return o
Example 18
def blastfilter_main(blast_file, p, opts):

    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    fp = open(blast_file)
    total_lines = sum(1 for line in fp)
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp], \
            key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query,
                    qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(subject,
                    sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
                if opts.tandems_only else None

        if is_self:
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups", "w") \
                    if opts.tandems_only else None
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    fw = open(blastfilteredfile, "w")
    write_new_blast(filtered_blasts, fh=fw)
    fw.close()
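
filter_cscore itself is not shown on this page; a sketch of how such a C-score filter is commonly implemented (keep a hit only if its score is close to the best score seen for both its query and its subject) might look like this:

from collections import defaultdict

def filter_cscore_sketch(blasts, cscore=0.5):
    # Sketch (assumption): C-score of a hit = score / max(best score of its
    # query, best score of its subject); keep hits with C-score >= cscore.
    best = defaultdict(float)
    for b in blasts:
        best[b.query] = max(best[b.query], b.score)
        best[b.subject] = max(best[b.subject], b.score)
    for b in blasts:
        if b.score >= cscore * max(best[b.query], best[b.subject]):
            yield b
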
Example 19
def blastplot(
    ax,
    blastfile,
    qsizes,
    ssizes,
    qbed,
    sbed,
    style="dot",
    sampleN=None,
    baseticks=False,
    insetLabels=False,
    stripNames=False,
    highlights=None,
):

    assert style in DotStyles
    fp = open(blastfile)

    qorder = qbed.order if qbed else None
    sorder = sbed.order if sbed else None

    data = []

    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject

        if stripNames:
            query = query.rsplit(".", 1)[0]
            subject = subject.rsplit(".", 1)[0]

        if qorder:
            if query not in qorder:
                continue
            qi, q = qorder[query]
            query = q.seqid
            qstart, qend = q.start, q.end
        else:
            qstart, qend = b.qstart, b.qstop

        if sorder:
            if subject not in sorder:
                continue
            si, s = sorder[subject]
            subject = s.seqid
            sstart, send = s.start, s.end
        else:
            sstart, send = b.sstart, b.sstop

        qi = qsizes.get_position(query, qstart)
        qj = qsizes.get_position(query, qend)
        si = ssizes.get_position(subject, sstart)
        sj = ssizes.get_position(subject, send)

        if None in (qi, si):
            continue
        data.append(((qi, qj), (si, sj)))

    if sampleN:
        if len(data) > sampleN:
            data = sample(data, sampleN)

    if not data:
        return logging.error("no blast data imported")

    xsize, ysize = qsizes.totalsize, ssizes.totalsize
    logging.debug("xsize=%d ysize=%d" % (xsize, ysize))

    if style == "line":
        for a, b in data:
            ax.plot(a, b, "ro-", mfc="w", mec="r", ms=3)
    else:
        data = [(x[0], y[0]) for x, y in data]
        x, y = zip(*data)

        if style == "circle":
            ax.plot(x, y, "mo", mfc="w", mec="m", ms=3)
        elif style == "dot":
            ax.scatter(x, y, s=3, lw=0)

    xlim = (0, xsize)
    ylim = (ysize, 0)  # invert the y-axis

    xchr_labels, ychr_labels = [], []
    ignore = True  # tag to mark whether to plot chr name (skip small ones)
    ignore_size_x = ignore_size_y = 0

    # plot the chromosome breaks
    logging.debug("xbreaks={0} ybreaks={1}".format(len(qsizes), len(ssizes)))
    for (seqid, beg, end) in qsizes.get_breaks():
        ignore = abs(end - beg) < ignore_size_x
        if ignore:
            continue
        seqid = rename_seqid(seqid)

        xchr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot([end, end], ylim, "-", lw=1, color="grey")

    for (seqid, beg, end) in ssizes.get_breaks():
        ignore = abs(end - beg) < ignore_size_y
        if ignore:
            continue
        seqid = rename_seqid(seqid)

        ychr_labels.append((seqid, (beg + end) / 2, ignore))
        ax.plot(xlim, [end, end], "-", lw=1, color="grey")

    # plot the chromosome labels
    for label, pos, ignore in xchr_labels:
        if not ignore:
            if insetLabels:
                ax.text(pos,
                        0,
                        label,
                        size=8,
                        ha="center",
                        va="top",
                        color="grey")
            else:
                pos = 0.1 + pos * 0.8 / xsize
                root.text(
                    pos,
                    0.91,
                    label,
                    size=10,
                    ha="center",
                    va="bottom",
                    rotation=45,
                    color="grey",
                )

    # remember y labels are inverted
    for label, pos, ignore in ychr_labels:
        if not ignore:
            if insetLabels:
                continue
            pos = 0.9 - pos * 0.8 / ysize
            root.text(0.91, pos, label, size=10, va="center", color="grey")

    # Highlight regions based on a list of BedLine
    qhighlights = shighlights = None
    if highlights:
        if isinstance(highlights[0], BedLine):
            shighlights = highlights
        elif len(highlights) == 2:
            qhighlights, shighlights = highlights

    if qhighlights:
        for hl in qhighlights:
            hls = qsizes.get_position(hl.seqid, hl.start)
            ax.add_patch(
                Rectangle((hls, 0), hl.span, ysize, fc="r", alpha=0.2, lw=0))
    if shighlights:
        for hl in shighlights:
            hls = ssizes.get_position(hl.seqid, hl.start)
            ax.add_patch(
                Rectangle((0, hls), xsize, hl.span, fc="r", alpha=0.2, lw=0))

    if baseticks:

        def increaseDensity(a, ratio=4):
            assert len(a) > 1
            stepsize = a[1] - a[0]
            newstepsize = int(stepsize / ratio)
            return np.arange(0, a[-1], newstepsize)

        # Increase the density of the ticks
        xticks = ax.get_xticks()
        yticks = ax.get_yticks()
        xticks = increaseDensity(xticks, ratio=2)
        yticks = increaseDensity(yticks, ratio=2)
        ax.set_xticks(xticks)

        # Plot outward ticklines
        for pos in xticks[1:]:
            if pos > xsize:
                continue
            pos = 0.1 + pos * 0.8 / xsize
            root.plot((pos, pos), (0.08, 0.1), "-", color="grey", lw=2)

        for pos in yticks[1:]:
            if pos > ysize:
                continue
            pos = 0.9 - pos * 0.8 / ysize
            root.plot((0.09, 0.1), (pos, pos), "-", color="grey", lw=2)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # beautify the numeric axis
    for tick in ax.get_xticklines() + ax.get_yticklines():
        tick.set_visible(False)

    set_human_base_axis(ax)

    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color="gray",
             size=10)
    plt.setp(ax.get_yticklabels(), rotation=90)
Example 20
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                        homologs.joined(bed[i-x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i - x].accn]
                        if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                            continue
                        g.join(bed[i - x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >> fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))

    return families
Example 21
def overlap(args):
    """
    %prog overlap <a|a.fasta> <b|b.fasta>

    Check overlaps between two FASTA records. The arguments can be GenBank IDs
    instead of FASTA files; in that case, the sequences will be downloaded
    first.
    """
    from jcvi.formats.blast import chain_HSPs

    p = OptionParser(overlap.__doc__)
    p.add_option("--dir",
                 default=os.getcwd(),
                 help="Download sequences to dir [default: %default]")
    p.add_option("--suffix",
                 default="fasta",
                 help="Suffix of the sequence file in dir [default: %default]")
    p.add_option("--qreverse",
                 default=False,
                 action="store_true",
                 help="Reverse seq a [default: %default]")
    p.add_option("--nochain",
                 default=False,
                 action="store_true",
                 help="Do not chain adjacent HSPs [default: chain HSPs]")
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap, evalue=.01)
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    afasta, bfasta = args
    dir = opts.dir
    chain = not opts.nochain
    suffix = opts.suffix
    evalue = opts.evalue
    pctid = opts.pctid
    hitlen = opts.hitlen
    cutoff = Cutoff(pctid, hitlen)

    # Check first whether it is file or accession name
    if not op.exists(afasta):
        af = op.join(dir, ".".join((afasta, suffix)))
        if not op.exists(af):  # Check to avoid redownload
            entrez([afasta, "--skipcheck", "--outdir=" + dir])
        afasta = af

    if not op.exists(bfasta):
        bf = op.join(dir, ".".join((bfasta, suffix)))
        if not op.exists(bf):
            entrez([bfasta, "--skipcheck", "--outdir=" + dir])
        bfasta = bf

    assert op.exists(afasta) and op.exists(bfasta)

    cmd = "blastn -dust no"
    cmd += " -query {0} -subject {1}".format(afasta, bfasta)
    cmd += " -evalue {0} -outfmt 6 -perc_identity {1}".format(evalue, pctid)

    fp = popen(cmd)
    hsps = fp.readlines()

    hsps = [BlastLine(x) for x in hsps]
    hsps = [x for x in hsps if x.hitlen >= hitlen]
    if chain:
        logging.debug("Chain HSPs in the Blast output.")
        dist = 2 * hitlen  # Distance to chain the HSPs
        hsps = chain_HSPs(hsps, xdist=dist, ydist=dist)

    if len(hsps) == 0:
        print >> sys.stderr, "No match found."
        return None

    besthsp = hsps[0]

    aid, asize = next(Fasta(afasta).itersizes())
    bid, bsize = next(Fasta(bfasta).itersizes())
    o = Overlap(besthsp, asize, bsize, cutoff, qreverse=opts.qreverse)
    o.print_graphic()

    if opts.outfile:
        fw = must_open(opts.outfile, "w")
        print(str(o), file=fw)
        fw.close()

    return o