Exemple #1
0
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
                  + ".anchors"
    fw = open(anchorsfile, "w")
    for b in Blast(lastfile):
        print >> fw, "\t".join((gene_name(b.query), gene_name(b.subject),
                                str(int(b.score))))
    fw.close()

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
Exemple #2
0
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile
    anchorsfile = ".".join(op.basename(lastfile).split(".",
                                                       2)[:2]) + ".anchors"
    fw = open(anchorsfile, "w")
    for b in Blast(lastfile):
        print >> fw, "\t".join(
            (gene_name(b.query), gene_name(b.subject), str(int(b.score))))
    fw.close()

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
Exemple #3
0
def iadhore(args):
    """
    %prog iadhore athaliana.athaliana.last athaliana.bed

    Wrap around iADHoRe.
    """
    p = OptionParser(iadhore.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    blast_table = "blast_table.txt"
    fp = open(lastfile)
    seen = set()
    for row in fp:
        c = BlastLine(row)
        a, b = c.query, c.subject
        a, b = gene_name(a), gene_name(b)
        if a > b:
            a, b = b, a
        seen.add((a, b))

    fw = open(blast_table, "w")
    for a, b in seen:
        print >> fw, "\t".join((a, b))
    fw.close()
    logging.debug("A total of {0} pairs written to `{1}`"\
            .format(len(seen), blast_table))

    fw = open("config.txt", "w")
    for bedfile in bedfiles:
        pf, stanza = write_lst(bedfile)
        print >> fw, "genome={0}".format(pf)
        for seqid, fname in stanza:
            print >> fw,  " ".join((seqid, fname))
        print >> fw

    print >> fw, "blast_table={0}".format(blast_table)
    print >> fw, "cluster_type=colinear"
    print >> fw, "tandem_gap=10"
    print >> fw, "prob_cutoff=0.001"
    print >> fw, "gap_size=20"
    print >> fw, "cluster_gap=20"
    print >> fw, "q_value=0.9"
    print >> fw, "anchor_points=4"
    print >> fw, "alignment_method=gg2"
    print >> fw, "max_gaps_in_alignment=20"
    print >> fw, "output_path=i-adhore_out"
    print >> fw, "number_of_threads=4"
    fw.close()
Exemple #4
0
def iadhore(args):
    """
    %prog iadhore athaliana.athaliana.last athaliana.bed

    Wrap around iADHoRe.
    """
    p = OptionParser(iadhore.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    blast_table = "blast_table.txt"
    fp = open(lastfile)
    seen = set()
    for row in fp:
        c = BlastLine(row)
        a, b = c.query, c.subject
        a, b = gene_name(a), gene_name(b)
        if a > b:
            a, b = b, a
        seen.add((a, b))

    fw = open(blast_table, "w")
    for a, b in seen:
        print("\t".join((a, b)), file=fw)
    fw.close()
    logging.debug("A total of {0} pairs written to `{1}`"\
            .format(len(seen), blast_table))

    fw = open("config.txt", "w")
    for bedfile in bedfiles:
        pf, stanza = write_lst(bedfile)
        print("genome={0}".format(pf), file=fw)
        for seqid, fname in stanza:
            print(" ".join((seqid, fname)), file=fw)
        print(file=fw)

    print("blast_table={0}".format(blast_table), file=fw)
    print("cluster_type=colinear", file=fw)
    print("tandem_gap=10", file=fw)
    print("prob_cutoff=0.001", file=fw)
    print("gap_size=20", file=fw)
    print("cluster_gap=20", file=fw)
    print("q_value=0.9", file=fw)
    print("anchor_points=4", file=fw)
    print("alignment_method=gg2", file=fw)
    print("max_gaps_in_alignment=20", file=fw)
    print("output_path=i-adhore_out", file=fw)
    print("number_of_threads=4", file=fw)
    fw.close()
Exemple #5
0
 def __init__(self, row, strip_names=False):
     args = row.strip().split(",")
     self.name = args[0]
     self.yn_ks = self.get_float(args[1])
     self.yn_ka = self.get_float(args[2])
     self.ng_ks = self.get_float(args[3])
     self.ng_ka = self.get_float(args[4])
     self.ks = self.ng_ks
     if ";" in self.name:
         self.gene_a, self.gene_b = self.name.split(";")
     if strip_names:
         self.gene_a = gene_name(self.gene_a)
         self.gene_b = gene_name(self.gene_b)
Exemple #6
0
 def __init__(self, row, strip_names=False):
     args = row.strip().split(",")
     self.name = args[0]
     self.yn_ks = self.get_float(args[1])
     self.yn_ka = self.get_float(args[2])
     self.ng_ks = self.get_float(args[3])
     self.ng_ka = self.get_float(args[4])
     self.ks = self.ng_ks
     if ";" in self.name:
         self.gene_a, self.gene_b = self.name.split(";")
     if strip_names:
         self.gene_a = gene_name(self.gene_a)
         self.gene_b = gene_name(self.gene_b)
Exemple #7
0
def read_ks_file(ks_file, strip_names=False):
    from jcvi.utils.cbook import gene_name

    reader = csv.reader(open(ks_file, "rb"))
    data = []
    for row in reader:
        if row[0] == "name":  # header
            continue

        for i, a in enumerate(row):
            if i == 0:
                if strip_names:
                    row[i] = ";".join(gene_name(x) for x in row[i].split(";"))
                continue
            try:
                row[i] = float(row[i])
            except:
                row[i] = -1

        data.append(KsLine._make(row))

    logging.debug('File `{0}` contains a total of {1} gene pairs'.\
            format(ks_file, len(data)))

    return data
Exemple #8
0
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    read the blast and convert name into coordinates
    """
    filtered_blast = []
    seen = set()
    bl = Blast(blast_file)
    for b in bl:
        query, subject = b.query, b.subject
        if query == subject:
            continue
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder or subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self:
            # remove redundant a<->b to one side when doing self-self BLAST
            if qi > si:
                query, subject = subject, query
                qi, si = si, qi
                q, s = s, q
            # Too close to diagonal! possible tandem repeats
            if q.seqid == s.seqid and si - qi < 40:
                continue

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si
        b.query, b.subject = query, subject

        filtered_blast.append(b)

    logging.debug("A total of {0} BLAST imported from `{1}`.".\
                  format(len(filtered_blast), blast_file))

    return filtered_blast
Exemple #9
0
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    read the blast and convert name into coordinates
    """
    filtered_blast = []
    seen = set()
    bl = Blast(blast_file)
    for b in bl:
        query, subject = b.query, b.subject
        if query == subject:
            continue
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder or subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self:
            # remove redundant a<->b to one side when doing self-self BLAST
            if qi > si:
                query, subject = subject, query
                qi, si = si, qi
                q, s = s, q
            # Too close to diagonal! possible tandem repeats
            if q.seqid == s.seqid and si - qi < 40:
                continue

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si
        b.query, b.subject = query, subject

        filtered_blast.append(b)

    logging.debug("A total of {0} BLAST imported from `{1}`.".\
                  format(len(filtered_blast), blast_file))

    return filtered_blast
Exemple #10
0
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))
Exemple #11
0
def main(blast_file, cds_file, bed_file, N=3):

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))
Exemple #12
0
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    read the blast and convert name into coordinates
    """
    fp = open(blast_file)
    filtered_blast = []
    seen = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder or subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si
        b.query, b.subject = query, subject

        filtered_blast.append(b)

    logging.debug("A total of {0} BLAST imported from `{1}`.".\
                  format(len(filtered_blast), blast_file))

    return filtered_blast
Exemple #13
0
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    read the blast and convert name into coordinates
    """
    fp = open(blast_file)
    filtered_blast = []
    seen = set()
    for row in fp:
        b = BlastLine(row)
        query, subject = b.query, b.subject
        if query == subject:
            continue
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder or subject not in sorder:
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)

        b.qseqid, b.sseqid = q.seqid, s.seqid
        b.qi, b.si = qi, si
        b.query, b.subject = query, subject

        filtered_blast.append(b)

    logging.debug("A total of {0} BLAST imported from `{1}`.".\
                  format(len(filtered_blast), blast_file))

    return filtered_blast
Exemple #14
0
def some(args):
    """
    %prog some bedfile idsfile > newbedfile

    Retrieve a subset of bed features given a list of ids.
    """
    from jcvi.formats.base import SetFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(some.__doc__)
    p.add_option("-v",
                 dest="inverse",
                 default=False,
                 action="store_true",
                 help="Get the inverse, like grep -v [default: %default]")
    p.set_outfile()
    p.set_stripnames()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, idsfile = args
    inverse = opts.inverse
    ostrip = opts.strip_names
    fw = must_open(opts.outfile, "w")

    ids = SetFile(idsfile)
    if ostrip:
        ids = set(gene_name(x) for x in ids)
    bed = Bed(bedfile)
    ntotal = nkeep = 0
    for b in bed:
        ntotal += 1
        keep = b.accn in ids
        if inverse:
            keep = not keep

        if keep:
            nkeep += 1
            print >> fw, b

    fw.close()
    logging.debug("Stats: {0} features kept.".\
                    format(percentage(nkeep, ntotal)))
Exemple #15
0
def some(args):
    """
    %prog some bedfile idsfile > newbedfile

    Retrieve a subset of bed features given a list of ids.
    """
    from jcvi.formats.base import SetFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(some.__doc__)
    p.add_option("-v", dest="inverse", default=False, action="store_true",
                 help="Get the inverse, like grep -v [default: %default]")
    p.set_outfile()
    p.set_stripnames()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, idsfile = args
    inverse = opts.inverse
    ostrip = opts.strip_names
    fw = must_open(opts.outfile, "w")

    ids = SetFile(idsfile)
    if ostrip:
        ids = set(gene_name(x) for x in ids)
    bed = Bed(bedfile)
    ntotal = nkeep = 0
    for b in bed:
        ntotal += 1
        keep = b.accn in ids
        if inverse:
            keep = not keep

        if keep:
            nkeep += 1
            print >> fw, b

    fw.close()
    logging.debug("Stats: {0} features kept.".\
                    format(percentage(nkeep, ntotal)))
Exemple #16
0
 def anchorline(self):
     return "\t".join((gene_name(self.gene_a), gene_name(self.gene_b),
                       "{:.3f}".format(self.ks)))
Exemple #17
0
def benchmark(args):
    """
    %prog benchmark at bedfile

    Compare SynFind, MCScanx, iADHoRe and OrthoFinder against the truth.
    """
    p = OptionParser(benchmark.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pf, bedfile = args
    truth = pf + ".truth"
    synfind = pf + ".synfind"
    mcscanx = pf + ".mcscanx"
    iadhore = pf + ".iadhore"
    orthofinder = pf + ".orthofinder"
    pivots = set([x.accn for x in Bed(bedfile)])

    fp = open(truth)
    truth = set()
    for row in fp:
        a, b = row.strip().split("\t")[:2]
        pivots.add(a)
        truth.add(tuple(sorted((a, b))))
    logging.debug("Truth: {0} pairs".format(len(truth)))

    fp = open(synfind)
    benchmarkfile = pf + ".benchmark"
    fw = must_open(benchmarkfile, "w")
    synfind = set()
    for row in fp:
        atoms = row.strip().split("\t")
        query, hit, tag = atoms[:3]
        if tag != "S":
            continue
        synfind.add(tuple(sorted((query, hit))))
    calc_sensitivity_specificity(synfind, truth, "SynFind", fw)

    fp = open(mcscanx)
    mcscanx = set()
    for row in fp:
        if row[0] == '#':
            continue
        atoms = row.strip().split(":")[1].split()
        query, hit = atoms[:2]
        mcscanx.add(tuple(sorted((query, hit))))
    calc_sensitivity_specificity(mcscanx, truth, "MCScanX", fw)

    fp = open(iadhore)
    iadhore = set()
    next(fp)
    for row in fp:
        atoms = row.strip().split("\t")
        query, hit = atoms[3:5]
        iadhore.add(tuple(sorted((query, hit))))
    calc_sensitivity_specificity(iadhore, truth, "iADHoRe", fw)

    fp = open(orthofinder)
    orthofinder = set()
    next(fp)
    for row in fp:
        row = row.replace('"', "")
        atoms = row.replace(",", " ").split()
        genes = [x.strip() for x in atoms if not x.startswith("OG")]
        genes = [gene_name(x) for x in genes]
        pps = [x for x in genes if x in pivots]
        for p in pps:
            for g in genes:
                if p == g:
                    continue
                orthofinder.add(tuple(sorted((p, g))))
    #write_pairs(orthofinder, "orthofinder.pairs")
    calc_sensitivity_specificity(orthofinder, truth, "OrthoFinder", fw)
    fw.close()
Exemple #18
0
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len)*P/100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N+1):
                    if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                        homologs.joined(bed[i-x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i-x].accn]
                        if abs(leni - lenx) > max(leni, lenx)*(1-P/100.):
                            continue
                        g.join(bed[i-x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >>fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >>sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >>sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >>sys.stderr, "Longest families (%d): %s" % (len(longest_family),
        ",".join(longest_family))

    return families
Exemple #19
0
def test_gene_name(input, output):
    from jcvi.utils.cbook import gene_name

    assert gene_name(input) == output
Exemple #20
0
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    fp = open(blocksfile)
    genetrack = {}
    proxytrack = {}
    for row in fp:
        a, b = row.split()
        genetrack[a] = b
        blocks.append((a, b))

    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print "\t".join((b.seqid, accn, target_region))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
Exemple #21
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len(set([gene_name(x.accn) for x in beds]))
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                if 0 < span <= maxsize:
                    print(
                        "\t".join(
                            str(x) for x in (chr, leftb.start, rightb.end, gb.accn)
                        ),
                        file=fwp,
                    )

                print(leftb, file=fwe)
                print(gb, file=fwe)
                print(rightb, file=fwe)
                print(
                    "L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label
                    ),
                    file=fwe,
                )
                print(gb.accn, file=fw_ids)
                continue

            large += 1
            large_genes += blocksize

            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            print("\t".join(str(x) for x in (chr, rmin - 1, rmax, name)), file=fw)

    fw.close()
    fwe.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Exemple #22
0
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
            help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    fw = open(extrabed, "w")
    fwe = open(extendbed, "w")
    fwp = open(pastebed, "w")
    fw_ids = open(extendbed + ".ids", "w")

    singletons, large, large_genes = 0, 0, 0
    for chr, chrbed in groupby(bed, key=lambda x: x.seqid):
        chrbed = list(chrbed)
        for good, beds in groupby(chrbed, key=key):
            if good:
                continue

            beds = list(beds)
            blocksize = len(set([gene_name(x.accn) for x in beds]))
            if blocksize == 1:
                singletons += 1
                accn = beds[0].accn
                gi, gb = order[accn]
                leftb = obed[gi - 1]
                rightb = obed[gi + 1]
                leftr = leftb.range
                rightr = rightb.range
                cur = gb.range
                distance_to_left, oo = range_distance(leftr, cur)
                distance_to_right, oo = range_distance(cur, rightr)
                span, oo = range_distance(leftr, rightr)

                if distance_to_left <= distance_to_right and \
                   distance_to_left > 0:
                    label = "LEFT"
                else:
                    label = "RIGHT"

                if 0 < span <= maxsize:
                    print >> fwp, "\t".join(str(x) for x in \
                                    (chr, leftb.start, rightb.end, gb.accn))

                print >> fwe, leftb
                print >> fwe, gb
                print >> fwe, rightb
                print >> fwe, "L:{0} R:{1} [{2}]".format(distance_to_left, \
                            distance_to_right, label)
                print >> fw_ids, gb.accn
                continue

            large += 1
            large_genes += blocksize

            ranges = [(x.start, x.end) for x in beds]
            rmin, rmax = range_minmax(ranges)
            rmin -= flank
            rmax += flank

            name = "-".join((beds[0].accn, beds[-1].accn))
            print >> fw, "\t".join(str(x) for x in (chr, rmin - 1, rmax, name))

    fw.close()
    fwe.close()

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
Exemple #23
0
def blastfilter_main(blast_file, p, opts):

    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    fp = open(blast_file)
    total_lines = sum(1 for line in fp if line[0] != '#')
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    bl = Blast(blast_file)
    blasts = sorted(list(bl), key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query,
                    qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(subject,
                    sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
                if opts.tandems_only else None

        if is_self:
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups", "w") \
                    if opts.tandems_only else None
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            #sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    fw = open(blastfilteredfile, "w")
    write_new_blast(filtered_blasts, fh=fw)
    fw.close()
Exemple #24
0
def benchmark(args):
    """
    %prog benchmark at bedfile

    Compare SynFind, MCScanx, iADHoRe and OrthoFinder against the truth.
    """
    p = OptionParser(benchmark.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pf, bedfile = args
    truth = pf + ".truth"
    synfind = pf + ".synfind"
    mcscanx = pf + ".mcscanx"
    iadhore = pf + ".iadhore"
    orthofinder = pf + ".orthofinder"
    pivots = set([x.accn for x in Bed(bedfile)])

    fp = open(truth)
    truth = set()
    for row in fp:
        a, b = row.strip().split("\t")[:2]
        pivots.add(a)
        truth.add(tuple(sorted((a, b))))
    logging.debug("Truth: {0} pairs".format(len(truth)))

    fp = open(synfind)
    benchmarkfile = pf + ".benchmark"
    fw = must_open(benchmarkfile, "w")
    synfind = set()
    for row in fp:
        atoms = row.strip().split("\t")
        query, hit, tag = atoms[:3]
        if tag != "S":
            continue
        synfind.add(tuple(sorted((query, hit))))
    calc_sensitivity_specificity(synfind, truth, "SynFind", fw)

    fp = open(mcscanx)
    mcscanx = set()
    for row in fp:
        if row[0] == '#':
            continue
        atoms = row.strip().split(":")[1].split()
        query, hit = atoms[:2]
        mcscanx.add(tuple(sorted((query, hit))))
    calc_sensitivity_specificity(mcscanx, truth, "MCScanX", fw)

    fp = open(iadhore)
    iadhore = set()
    fp.next()
    for row in fp:
        atoms = row.strip().split("\t")
        query, hit = atoms[3:5]
        iadhore.add(tuple(sorted((query, hit))))
    calc_sensitivity_specificity(iadhore, truth, "iADHoRe", fw)

    fp = open(orthofinder)
    orthofinder = set()
    fp.next()
    for row in fp:
        row = row.replace('"', "")
        atoms = row.replace(",", " ").split()
        genes = [x.strip() for x in atoms if not x.startswith("OG")]
        genes = [gene_name(x) for x in genes]
        pps = [x for x in genes if x in pivots]
        for p in pps:
            for g in genes:
                if p == g:
                    continue
                orthofinder.add(tuple(sorted((p, g))))
    #write_pairs(orthofinder, "orthofinder.pairs")
    calc_sensitivity_specificity(orthofinder, truth, "OrthoFinder", fw)
    fw.close()
Exemple #25
0
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    if all([i-x >= 0, bed[i-x].seqid == atom.seqid, \
                        homologs.joined(bed[i-x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i - x].accn]
                        if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                            continue
                        g.join(bed[i - x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print >> fw, ",".join(sorted(group))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print >> sys.stderr, "Proximal paralogues (dist=%d):" % N
    print >> sys.stderr, "Total %d genes in %d families" % (ngenes, nfamilies)
    print >> sys.stderr, "Longest families (%d): %s" % (
        len(longest_family), ",".join(longest_family))

    return families
Exemple #26
0
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A,B) = score(A,B) /
             max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff",
                 default=.9999,
                 type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct",
                 default=False,
                 action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast",
                 default=False,
                 action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        args = [query, subject, "{0:.2f}".format(s)]
        if pct:
            args.append("{0:.1f}".format(pctid))
        print >> fw, "\t".join(args)
        if writeblast:
            print >> fwb, b
    fw.close()
    if writeblast:
        fwb.close()
Exemple #27
0
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A,B) = score(A,B) /
             max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff", default=.9999, type="float",
            help="Minimum C-score to report [default: %default]")
    p.add_option("--pct", default=False, action="store_true",
            help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast", default=False, action="store_true",
            help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)
    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        s = score / max(best_score[query], best_score[subject])
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    if writeblast:
        fwb = must_open(outfile + ".filtered.blast", "w")
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        args = [query, subject, "{0:.2f}".format(s)]
        if pct:
            args.append("{0:.1f}".format(pctid))
        print >> fw, "\t".join(args)
        if writeblast:
            print >> fwb, b
    fw.close()
    if writeblast:
        fwb.close()
Exemple #28
0
def blastfilter_main(blast_file, p, opts):

    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    fp = open(blast_file)
    total_lines = sum(1 for line in fp if line[0] != '#')
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    bl = Blast(blast_file)
    blasts = sorted(list(bl), key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query, qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(
                    subject, sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." %
                      (before_filter, len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed,
                                  filtered_blasts,
                                  flip=True,
                                  tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed,
                                  filtered_blasts,
                                  flip=False,
                                  tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
                if opts.tandems_only else None

        if is_self:
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups", "w") \
                    if opts.tandems_only else None
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            #sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    fw = open(blastfilteredfile, "w")
    write_new_blast(filtered_blasts, fh=fw)
    fw.close()
Exemple #29
0
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    p = OptionParser(loss.__doc__)
    p.add_option("--bed",
                 default=False,
                 action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist",
                 default=20,
                 type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist",
                 default=20000,
                 type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)
    blocks = []
    fp = open(blocksfile)
    genetrack = {}
    proxytrack = {}
    for row in fp:
        a, b = row.split()
        genetrack[a] = b
        blocks.append((a, b))

    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        print "\t".join((b.seqid, accn, target_region))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
Exemple #30
0
 def anchorline(self):
     return "\t".join((gene_name(self.gene_a), gene_name(self.gene_b),
                      "{:.3f}".format(self.ks)))