Esempio n. 1
0
def build_order(qbed_file, sbed_file):
    """Load the query and subject annotation files and their gene orders.

    Args:
        qbed_file: path of the query annotation file handed to ``Bed``.
        sbed_file: path of the subject annotation file handed to ``Bed``.

    Returns:
        A ``(qbed, sbed, qorder, sorder)`` tuple: the two ``Bed`` objects
        followed by the result of ``get_order()`` on each of them.
    """
    # The progress note belongs on stderr.  ``print(sys.stderr, ...)`` does
    # not redirect anything: it prints the stream object itself (or, under
    # Python 2, a tuple) to stdout.  Writing to the stream directly behaves
    # the same on Python 2 and Python 3.
    sys.stderr.write(
        "Read annotation files %s and %s\n" % (qbed_file, sbed_file))
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    return qbed, sbed, qorder, sorder
def build_order(qbed_file, sbed_file):
    """Parse both annotation files and return them with their orderings.

    The query file is loaded before the subject file, and ``get_order()`` is
    called on each in that same sequence.  Returns the tuple
    ``(qbed, sbed, qorder, sorder)``.
    """
    message = "Read annotation files %s and %s" % (qbed_file, sbed_file)
    print >> sys.stderr, message

    beds = [Bed(qbed_file), Bed(sbed_file)]
    orders = [bed.get_order() for bed in beds]
    return beds[0], beds[1], orders[0], orders[1]
Esempio n. 3
0
def main(qbed_file, sbed_file, blast_file, pairs_file, out_fh, padding, query=True):
    """Collect the best hits found between consecutive ortholog positions.

    Args:
        qbed_file: path of the query annotation file.
        sbed_file: path of the subject annotation file.
        blast_file: BLAST file handed to ``blast_grouped``.
        pairs_file: ortholog pairs file handed to ``get_pairs``.
        out_fh: output handle forwarded to ``write_best_hits``.
        padding: forwarded unchanged to ``get_best_hits``.
        query: when true the query genome is walked and the subject genome
            is only used as an order lookup; when false the two roles are
            swapped and the second member of each pair is used.

    Returns:
        The accumulated list of hits returned by ``get_best_hits`` (the same
        list that is passed to ``write_best_hits``).
    """
    if query:
        qbed = Bed(qbed_file)
        sorder = Bed(sbed_file).get_order()
    else:
        qbed = Bed(sbed_file)
        sorder = Bed(qbed_file).get_order()

    qorder = qbed.get_order()
    pairs, pairs_dict = get_pairs(pairs_file, query)
    # De-duplicate the accessions that belong to the genome being walked.
    if query:
        qorthos = list(set(qaccn for qaccn, saccn in pairs))
    else:
        qorthos = list(set(saccn for qaccn, saccn in pairs))
    qaccns = get_pos(qorder, qorthos)
    blasts = blast_grouped(blast_file, query)
    qaccns.sort()

    best_hits = []
    # Walk adjacent (left, right) ortholog positions.  This replaces an
    # index loop whose ``qi == len(qaccns)`` guard could never be true.
    for left_ortho, right_ortho in zip(qaccns, qaccns[1:]):
        if right_ortho - left_ortho == 1:
            # Neighbouring positions: nothing lies in between.
            continue
        best_hits += get_best_hits(left_ortho, right_ortho, blasts, qbed,
                                   sorder, pairs_dict, padding)

    write_best_hits(out_fh, best_hits)
    return best_hits
Esempio n. 4
0
def main(qbed_file,sbed_file,blast_file,pairs_file,out_fh,padding,query=True):
    """List genes whose immediate neighbours on both sides are orthologs.

    Args:
        qbed_file: path of the query annotation file.
        sbed_file: path of the subject annotation file.
        blast_file: unused here; kept so the signature stays compatible.
        pairs_file: ortholog pairs file handed to ``get_pairs``.
        out_fh: output handle forwarded to ``write_best_hits``.
        padding: unused here; kept so the signature stays compatible.
        query: when true the query file is walked and the first member of
            each pair is used; when false the subject file and the second
            member are used instead.

    Returns:
        The list of accessions whose position ``p`` is an ortholog position
        and for which ``p - 1`` and ``p + 1`` are ortholog positions too.
        The first and last ortholog positions are never reported.
    """
    # Only the genome being walked is needed; the other genome's order was
    # previously loaded into a local that nothing read.
    qbed = Bed(qbed_file if query else sbed_file)

    qorder = qbed.get_order()
    pairs, pairs_dict = get_pairs(pairs_file, query)
    if query:
        qorthos = list(set(qaccn for qaccn, saccn in pairs))
    else:
        qorthos = list(set(saccn for qaccn, saccn in pairs))
    qaccns = get_pos(qorder, qorthos)
    qaccns.sort()

    # Set membership keeps the neighbour test O(1) instead of scanning the
    # position list twice per gene.
    ortho_positions = set(qaccns)
    flankers = []
    # Slicing off both ends mirrors the original skip of the first and last
    # ortholog position.
    for q in qaccns[1:-1]:
        if q - 1 in ortho_positions and q + 1 in ortho_positions:
            flankers.append(qbed[q].accn)
    write_best_hits(out_fh, flankers)
    return flankers
Esempio n. 5
0
class RawBed(object):
    """Look up bed rows by raw position and raw positions by accession."""

    def __init__(self, bed):
        """Load *bed* through ``Bed`` and cache its ``get_order()`` result."""
        loaded = Bed(bed)
        self.bed = loaded
        self.order = loaded.get_order()

    def raw_to_bed(self, raw_pos):
        """Return the fields of the bed row stored at index *raw_pos*.

        The result is a dict with the keys ``start``, ``end``, ``seqid``,
        ``accn`` and ``strand``; the strand is the second item of the row's
        ``stuff`` attribute.
        """
        row = self.bed[raw_pos]
        fields = dict(start=row.start, end=row.end,
                      seqid=row.seqid, accn=row.accn)
        fields['strand'] = row.stuff[1]
        return fields

    def accn_to_raw(self, accn):
        """Return ``(position, seqid)`` for the accession *accn*."""
        entry = self.order[accn]
        return entry[0], entry[1].seqid
def main(blast_file, options):
    """Run ``batch_query`` in both orientations over the positioned hits.

    Args:
        blast_file: path of the BLAST file; each line is parsed with
            ``BlastLine``.
        options: object providing ``qbed``, ``sbed`` and ``sqlite``; it is
            also forwarded unchanged to ``batch_query``.

    Side effects:
        Writes progress notes to stderr.  When ``options.sqlite`` is set,
        the ``synteny`` table in that database is dropped and recreated, a
        cursor on it is handed to ``batch_query``, and an index on the
        ``query`` column is created and committed once both passes finish.
    """
    qbed_file, sbed_file = options.qbed, options.sbed
    sqlite = options.sqlite

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # Read the file once and let the context manager close it; the previous
    # code scanned it twice through a ``file()`` handle that was never closed.
    with open(blast_file) as fp:
        blast_lines = fp.readlines()
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, len(blast_lines))
    blasts = sorted((BlastLine(line) for line in blast_lines),
                    key=lambda b: b.score, reverse=True)

    # Keep (query index, subject index) for hits whose names are known in
    # both orders, best-scoring hits first.
    all_data = [(qorder[b.query][0], sorder[b.subject][0])
                for b in blasts
                if b.query in qorder and b.subject in sorder]

    conn = sqlite3.connect(sqlite) if sqlite else None
    try:
        c = None
        if conn is not None:
            c = conn.cursor()
            c.execute("drop table if exists synteny")
            c.execute("create table synteny (query text, anchor text, gray varchar(1), score integer, dr integer, "
                    "orientation varchar(1), qnote text, snote text)")

        batch_query(qbed, sbed, all_data, options, c=c, transpose=False)
        batch_query(qbed, sbed, all_data, options, c=c, transpose=True)

        if c is not None:
            c.execute("create index q on synteny (query)")
            conn.commit()
            c.close()
    finally:
        # Release the database even when a query pass fails.
        if conn is not None:
            conn.close()
Esempio n. 7
0
def main(qbed_file,sbed_file,pairs_file,out,query=True):
    """Write, for every ortholog, whether its two neighbours are orthologs.

    One tab-separated line is written per ortholog position:
    ``accn<TAB>left<TAB>right``, where ``left`` / ``right`` are ``True`` or
    ``False`` depending on whether the gene just before / after it is also
    in the ortholog set.

    Args:
        qbed_file: path of the query annotation file.
        sbed_file: path of the subject annotation file.
        pairs_file: ortholog pairs file handed to ``get_pairs``.
        out: path of the output file (created or truncated).
        query: when true the query file is walked and the first member of
            each pair is used; when false the subject file and the second
            member are used instead.
    """
    # Only the genome being walked is needed; the other genome's order was
    # previously loaded into a local that nothing read.
    qbed = Bed(qbed_file if query else sbed_file)

    qorder = qbed.get_order()
    pairs, pairs_dict = get_pairs(pairs_file, query)
    if query:
        qorthos = list(set(qaccn for qaccn, saccn in pairs))
    else:
        qorthos = list(set(saccn for qaccn, saccn in pairs))
    qaccns = get_pos(qorder, qorthos)

    # Neighbours are matched by accession rather than by position
    # (the original note cites merging issues, see Os12g12370).
    ortho_accns = set(qorthos)

    # The output is opened only after the inputs loaded, and ``with`` closes
    # it even if a lookup below fails.  Mode "wb" is kept from the original.
    with open(out, "wb") as out_fh:
        for qi in qaccns:
            # Index -1 would silently wrap around to the last gene, so the
            # first gene is reported as having no ortholog on its left.
            left_ortho = qi > 0 and qbed[qi - 1].accn in ortho_accns
            try:
                right_ortho = qbed[qi + 1].accn in ortho_accns
            except IndexError:
                # Last gene of the file: nothing on its right.
                # NOTE(review): assumes Bed indexing raises IndexError past
                # the end, as a list would -- confirm against Bed.
                right_ortho = False
            out_fh.write("{0}\t{1}\t{2}\n".format(
                qbed[qi].accn, left_ortho, right_ortho))
Esempio n. 8
0
class RawBed(object):
    """Bridge between raw positions and the rows of a ``Bed`` file."""

    # Row attributes copied verbatim into the dict built by raw_to_bed.
    _COPIED_FIELDS = ('start', 'end', 'seqid', 'accn')

    def __init__(self, bed):
        """Open *bed* with ``Bed`` and remember its ``get_order()`` mapping."""
        self.bed = Bed(bed)
        self.order = self.bed.get_order()

    def raw_to_bed(self, raw_pos):
        """Describe the bed row found at index *raw_pos* as a dict.

        Keys are ``start``, ``end``, ``seqid``, ``accn`` and ``strand``;
        ``strand`` comes from index 1 of the row's ``stuff`` attribute.
        """
        row = self.bed[raw_pos]
        record = {}
        for field in self._COPIED_FIELDS:
            record[field] = getattr(row, field)
        extras = row.stuff
        record['strand'] = extras[1]
        return record

    def accn_to_raw(self, accn):
        """Give the raw position and seqid stored for accession *accn*."""
        located = self.order[accn]
        position = located[0]
        seqid = located[1].seqid
        return position, seqid
Esempio n. 9
0
def main(blast_file, options):
    """Filter a BLAST file and write the surviving hits to a ``.raw`` file.

    Hits are sorted by descending score and de-duplicated per
    (query, subject) pair, so the best-scoring hit of each pair is kept.
    They then pass through whichever of the global-density, local-dups
    (tandem), repeat and cscore filters are enabled on ``options``.

    Args:
        blast_file: path of the BLAST output; lines starting with ``#`` are
            skipped.
        options: object providing ``qbed``, ``sbed``,
            ``global_density_ratio``, ``tandem_Nmax``, ``filter_repeats``,
            ``cscore``, ``strip_names``, ``tandems_only`` and
            ``write_filtered_blast``.

    Side effects:
        Progress notes and warnings go to stderr.  With ``tandem_Nmax`` set,
        ``<bed stem>.localdups`` files are opened for writing,
        ``write_new_bed`` is called, and the ``.nolocaldups`` bed files are
        loaded to rebuild both orders.  The final hits are handed to
        ``write_raw`` together with a handle on ``<blast_file stem>.raw``.
        The process exits through ``sys.exit()`` when ``options.tandems_only``
        is set (checked only inside the tandem step).
    """

    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # Read the file once and let the context manager close it; the previous
    # code scanned it twice through a ``file()`` handle that was never closed.
    with open(blast_file) as fp:
        blast_lines = fp.readlines()
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, len(blast_lines))
    # Comment lines are dropped before parsing; best scores come first so
    # the de-duplication below keeps the strongest hit of every pair.
    blasts = sorted([BlastLine(line) for line in blast_lines if line[0] != '#'],
            key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = options.strip_names
    for b in blasts:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']

        filtered_blasts.append(b)

    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        # NOTE(review): the .localdups handles are handed to write_localdups
        # and never closed here -- confirm that helper closes them.
        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w")

        if is_self:
            # Self comparison: merge the subject groups into the query ones
            # and reuse the single mapping for both sides.
            for s in standems: qtandems.join(*s)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w")
            sdups_to_mother = write_localdups(sdups_fh, standems, sbed)

        if options.tandems_only:
            # just want to use this script as a tandem finder.
            sys.exit()

        # write out new .bed after tandem removal
        write_new_bed(qbed, qdups_to_mother)
        if not is_self:
            write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        qnew_name = "%s.nolocaldups%s" % op.splitext(qbed.filename)
        snew_name = "%s.nolocaldups%s" % op.splitext(sbed.filename)

        qbed_new = Bed(qnew_name)
        sbed_new = Bed(snew_name)

        # From here on the orders refer to the beds without local dups.
        qorder = qbed_new.get_order()
        sorder = sbed_new.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    raw_name = "%s.raw" % op.splitext(blast_file)[0]
    # ``with`` flushes and closes the .raw file once write_raw returns.
    with open(raw_name, "w") as raw_fh:
        write_raw(qorder, sorder, filtered_blasts, raw_fh)
    if options.write_filtered_blast:
        write_new_blast(filtered_blasts)
Esempio n. 10
0
def main(blast_file, options):
    """Filter a BLAST file and emit the surviving hits via ``write_new_blast``.

    Hits are sorted by descending score and de-duplicated per
    (query, subject) pair, so the best-scoring hit of each pair is kept.
    They then pass through whichever of the global-density, local-dups
    (tandem), repeat and cscore filters are enabled on ``options``.

    Args:
        blast_file: path of the BLAST output; lines starting with ``#`` are
            skipped.
        options: object providing ``qbed``, ``sbed``,
            ``global_density_ratio``, ``tandem_Nmax``, ``filter_repeats``,
            ``cscore`` and ``localdups``.

    Side effects:
        Progress notes and warnings go to stderr.  With ``tandem_Nmax`` set
        and ``options.localdups`` true, ``<bed stem>.localdups`` files are
        opened for writing and ``write_new_bed`` is called; in either case
        the local dups are removed from ``qbed.beds`` / ``sbed.beds`` in
        memory.  The .raw output step is disabled; only
        ``write_new_blast`` is called with the final hits.
    """

    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    localdups = options.localdups

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # Read the file once and let the context manager close it; the previous
    # code scanned it twice through a ``file()`` handle that was never closed.
    with open(blast_file) as fp:
        blast_lines = fp.readlines()
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, len(blast_lines))

    # '#' lines are skipped (mdb added 3/18/16 for Last v731); best scores
    # come first so the de-duplication below keeps the strongest hit.
    blasts = sorted(
        [BlastLine(line) for line in blast_lines if not line.startswith("#")],
        key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = b.query, b.subject
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']

        filtered_blasts.append(b)

    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        # The .localdups files are only produced on request; otherwise
        # write_localdups receives None as its handle.
        # NOTE(review): the handles are never closed here -- confirm that
        # write_localdups closes them.
        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w") if localdups else None

        if is_self:
            # Self comparison: merge the subject groups into the query ones
            # and reuse the single mapping for both sides.
            for s in standems: qtandems.join(*s)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w") if localdups else None
            sdups_to_mother = write_localdups(sdups_fh, standems, sbed)

        if localdups:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        # Drop the local dups from the in-memory beds and rebuild the orders
        # instead of re-reading bed files from disk.
        qbed.beds = [x for x in qbed if x["accn"] not in qdups_to_mother]
        sbed.beds = [x for x in sbed if x["accn"] not in sdups_to_mother]

        qorder = qbed.get_order()
        sorder = sbed.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    # Final output after the BLAST filters.
    write_new_blast(filtered_blasts)
Esempio n. 11
0
def main(anchor_file, blast_file, options):
    """Print BLAST hits that lie close to known anchors as "lifted" pairs.

    Args:
        anchor_file: whitespace-separated file with one ``query subject``
            pair per line; lines starting with ``#`` and blank lines are
            skipped.
        blast_file: path of the BLAST file; each line is parsed with
            ``BlastLine``.
        options: object providing ``qbed``, ``sbed`` and ``dist`` (the
            upper bound passed to the nearest-anchor query).

    Side effects:
        Writes ``query<TAB>subject<TAB>lifted`` lines to stdout and progress
        notes, including the number of new pairs, to stderr.
    """

    def _strip_suffix(name):
        # Drop everything after the last "." of a BLAST name.
        return name.rsplit(".", 1)[0]

    qbed_file, sbed_file = options.qbed, options.sbed
    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >> sys.stderr, "... looks like a self-self BLAST to me"

    print >> sys.stderr, "read annotation files %s and %s" % (qbed_file,
                                                              sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # Read the file once and let the context manager close it; the previous
    # code scanned it twice through a ``file()`` handle that was never closed.
    with open(blast_file) as fp:
        blast_lines = fp.readlines()
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, len(blast_lines))
    # Best scores first, so the first hit seen for a pair is the one kept.
    blasts = sorted([BlastLine(line) for line in blast_lines],
                    key=lambda b: b.score, reverse=True)
    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = _strip_suffix(b.query), _strip_suffix(b.subject)
        if query not in qorder or subject not in sorder: continue
        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    # Anchor positions grouped by (query seqid, subject seqid).
    all_anchors = collections.defaultdict(list)
    with open(anchor_file) as fp:
        for row in fp:
            if row[0] == '#': continue
            fields = row.split()
            if not fields:
                # A blank line used to abort the run with an unpack error.
                continue
            qname, sname = fields
            if qname not in qorder or sname not in sorder: continue
            qi, q = qorder[qname]
            si, s = sorder[sname]
            all_anchors[(q.seqid, s.seqid)].append((qi, si))

    # grouping the hits based on chromosome pair for sending in find_nearby
    all_hits = collections.defaultdict(list)
    for b in filtered_blasts:
        all_hits[(b.qseqid, b.sseqid)].append((b.qi, b.si))

    # select hits that are close to the anchor list
    j = 0
    fw = sys.stdout
    for chr_pair in sorted(all_hits.keys()):
        hits = np.array(all_hits[chr_pair])
        anchors = np.array(all_anchors[chr_pair])

        print >> sys.stderr, chr_pair, len(anchors)
        if len(anchors) == 0: continue
        tree = cKDTree(anchors, leafsize=16)
        # p=1 selects the Manhattan metric; a hit with no anchor within
        # options.dist comes back with index tree.n.
        dists, idxs = tree.query(hits, p=1, distance_upper_bound=options.dist)

        for i, (dd, idx) in enumerate(zip(dists, idxs)):
            if dd == 0: continue  # same anchors
            if idx != tree.n:
                qi, si = hits[i]
                query, subject = qbed[qi]["accn"], sbed[si]["accn"]
                print >> fw, "\t".join((query, subject, "lifted"))
                j += 1

    print >> sys.stderr, j, "new pairs found"
def main(anchor_file, blast_file, options):
    """Report BLAST hits near existing anchors as additional "lifted" pairs.

    Args:
        anchor_file: whitespace-separated file with one ``query subject``
            pair per line; lines starting with ``#`` and blank lines are
            skipped.
        blast_file: path of the BLAST file; each line is parsed with
            ``BlastLine``.
        options: object providing ``qbed``, ``sbed`` and ``dist`` (the
            upper bound passed to the nearest-anchor query).

    Side effects:
        Writes ``query<TAB>subject<TAB>lifted`` lines to stdout and progress
        notes, including the number of new pairs, to stderr.
    """

    def _base_name(name):
        # Drop everything after the last "." of a BLAST name.
        return name.rsplit(".", 1)[0]

    qbed_file, sbed_file = options.qbed, options.sbed
    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # A single read inside ``with`` replaces the count-then-rewind pass over
    # a ``file()`` handle that was never closed.
    with open(blast_file) as fp:
        blast_lines = fp.readlines()
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, len(blast_lines))
    # Descending score order makes the de-duplication keep the best hit.
    blasts = sorted([BlastLine(line) for line in blast_lines],
                    key=lambda b: b.score, reverse=True)
    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = _base_name(b.query), _base_name(b.subject)
        if query not in qorder or subject not in sorder: continue
        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    # Anchor positions keyed by (query seqid, subject seqid).
    all_anchors = collections.defaultdict(list)
    with open(anchor_file) as fp:
        for row in fp:
            if row[0]=='#': continue
            columns = row.split()
            if not columns:
                # Blank lines previously raised an unpack error.
                continue
            anchor_q, anchor_s = columns
            if anchor_q not in qorder or anchor_s not in sorder: continue
            qi, q = qorder[anchor_q]
            si, s = sorder[anchor_s]
            all_anchors[(q.seqid, s.seqid)].append((qi, si))

    # grouping the hits based on chromosome pair for sending in find_nearby
    all_hits = collections.defaultdict(list)
    for b in filtered_blasts:
        all_hits[(b.qseqid, b.sseqid)].append((b.qi, b.si))

    # select hits that are close to the anchor list
    j = 0
    fw = sys.stdout
    for chr_pair in sorted(all_hits.keys()):
        hits = np.array(all_hits[chr_pair])
        anchors = np.array(all_anchors[chr_pair])

        print >>sys.stderr, chr_pair, len(anchors)
        if len(anchors)==0: continue
        tree = cKDTree(anchors, leafsize=16)
        # Manhattan distance (p=1); hits without an anchor inside
        # options.dist are reported with index tree.n.
        dists, idxs = tree.query(hits, p=1, distance_upper_bound=options.dist)

        for i, (dd, idx) in enumerate(zip(dists, idxs)):
            if dd==0: continue # same anchors
            if idx!=tree.n:
                qi, si = hits[i]
                query, subject = qbed[qi]["accn"], sbed[si]["accn"]
                print >>fw, "\t".join((query, subject, "lifted"))
                j+=1

    print >>sys.stderr, j, "new pairs found"