Code example #1
def main(gff_file,bed_corrected):
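    # Walk the GFF alongside the corrected BED: for each gene record, pick the
    # accession variant (split on ';') whose gene name matches the BED entry,
    # remember it, and reuse it for the gene's following child features.
    # Everything is written back out to <gff_file>.modified.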
    gff = Gff(gff_file)
    bed = Bed(bed_corrected)
    gffLength = len(gff)
    gff_index = 0
    info, before = '',''
    fp=open(gff_file+'.modified','w')
    for index in range(gffLength):
        item = gff[index]
        info = item.accn
        if item.accn != 'UNDEF':
            if item.type == 'gene':
                gene_bed = str(bed[gff_index]).split('\t')[3]
                gene_list = gene_bed.split('|')
                paralist = item.accn.split(';')
                print(gene_bed, paralist)
                if len(gene_list) == 1 and len(paralist)==2:
                    gene1, gene2 = paralist[0].split(',')[0], paralist[1].split(',')[0]
                    if gene_bed == gene1:
                        item.__setattr__("accn", paralist[0])
                        before = paralist[0]
                    elif gene_bed == gene2:
                        item.__setattr__("accn", paralist[1])
                        before = paralist[1]
                    else:
                        before = item.accn
                else :
                    before = item.accn
                gff_index = gff_index + 1
            else:
                item.__setattr__("accn",before)
        else:
            pass
        fp.write(str(item)+'\n')
    fp.close()
Code example #2
File: best_hit_ortho.py Project: gturco/best_hit
def main(qbed_file, sbed_file, blast_file, pairs_file, out_fh, padding, query=True):
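    # Walk the sorted positions of known orthologs along the query genome;
    # for every gap between two consecutive orthologs (skipping adjacent
    # ones), collect the best BLAST hits inside that interval and write
    # them out with write_best_hits().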
    if query:
        qbed = Bed(qbed_file)
        sbed = Bed(sbed_file).get_order()
    else:
        qbed = Bed(sbed_file)
        sbed = Bed(qbed_file).get_order()

    qorder = qbed.get_order()
    pairs, pairs_dict = get_pairs(pairs_file, query)
    if query:
        qorthos = list(set([qaccn for qaccn, saccn in pairs]))
    else:
        qorthos = list(set([saccn for qaccn, saccn in pairs]))
    qaccns = get_pos(qorder, qorthos)
    blasts = blast_grouped(blast_file, query)
    qaccns.sort()
    best_hits = []
    for qi, q in enumerate(qaccns):
        if qi == 0:
            continue
        if qi == len(qaccns):
            continue
        left_ortho = qaccns[qi - 1]
        right_ortho = qaccns[qi]
        if (right_ortho - left_ortho) == 1:
            continue
        #### if new ortho - old is == 1 no orthos inbetween
        new_hits = get_best_hits(left_ortho, right_ortho, blasts, qbed, sbed, pairs_dict, padding)
        best_hits += new_hits

    write_best_hits(out_fh, best_hits)
    return best_hits
Code example #3
File: syntenic_flankers.py Project: gturco/best_hit
def main(qbed_file,sbed_file,blast_file,pairs_file,out_fh,padding,query=True):
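    # Collect orthologous query genes whose immediate left and right
    # neighbours (by BED position) are also orthologs, and write those
    # "syntenic flanker" accessions out.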
    if query:
        qbed = Bed(qbed_file)
        sbed = Bed(sbed_file).get_order()
    else:
        qbed = Bed(sbed_file)
        sbed = Bed(qbed_file).get_order()

    qorder = qbed.get_order()
    pairs,pairs_dict = get_pairs(pairs_file,query)
    if query: qorthos = list(set([qaccn for qaccn,saccn in pairs]))
    else: qorthos = list(set([saccn for qaccn,saccn in pairs]))
    qaccns = get_pos(qorder,qorthos)
    qaccns.sort()
    flankers = []
    for qi,q in enumerate(qaccns):
        if qi == 0: continue
        if qi == len(qaccns) - 1: continue
        left_pos = q -1
        right_pos = q + 1
        if left_pos in qaccns and right_pos in qaccns:
            #print qbed[q].accn
            flankers.append(qbed[q].accn)
    write_best_hits(out_fh,flankers)
    return flankers
Code example #4
def build_order(qbed_file, sbed_file):
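    # Load the query and subject BED annotations and build their
    # accn -> (position, feature) lookup tables via get_order().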
    print >> sys.stderr, "Read annotation files %s and %s" % (qbed_file, sbed_file)

    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    return qbed, sbed, qorder, sorder
Code example #5
File: qa_parsers.py Project: gturco/find_cns
class RawBed(object):
    """takes line from habos raw file and converts to brents bed line"""

    def __init__(self,bed):
        self.bed = Bed(bed)
        self.order = self.bed.get_order()

    def raw_to_bed(self,raw_pos):
        """returns the bed file for the raw_pos"""
        bed_info = self.bed[raw_pos]
        d = {}
        d['start'] = bed_info.start
        d['end'] = bed_info.end
        d['seqid'] = bed_info.seqid
        d['accn'] = bed_info.accn
        args = bed_info.stuff
        d['strand'] = args[1]
        #d['locs'] = loc_conv(args[-2],args[-1])
        return d

    def accn_to_raw(self,accn):
        "returns the raw line inputs for accn"
        pos =self.order[accn][0]
        seqid = self.order[accn][1].seqid
        return pos,seqid
Code example #6
def main(blast_file, options):
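    # Read both BED annotations, parse the BLAST file sorted by descending
    # score, convert each hit into (query index, subject index) pairs, and
    # run batch_query() in both orientations, optionally storing results in
    # an sqlite "synteny" table.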
    qbed_file, sbed_file = options.qbed, options.sbed
    sqlite = options.sqlite
    
    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp], \
            key=lambda b: b.score, reverse=True)

    all_data = []
    for b in blasts:
        query, subject = b.query, b.subject
        if query not in qorder or subject not in sorder: continue
        qi, q = qorder[query]
        si, s = sorder[subject]
        all_data.append((qi, si))

    c = None
    if options.sqlite:
        conn = sqlite3.connect(options.sqlite)
        c = conn.cursor()
        c.execute("drop table if exists synteny")
        c.execute("create table synteny (query text, anchor text, gray varchar(1), score integer, dr integer, "
                "orientation varchar(1), qnote text, snote text)")

    batch_query(qbed, sbed, all_data, options, c=c, transpose=False)
    batch_query(qbed, sbed, all_data, options, c=c, transpose=True)

    if sqlite:
        c.execute("create index q on synteny (query)")
        conn.commit()
        c.close()
Code example #7
File: qa_parsers.py Project: yuzhenpeng/find_cns
def write_nolocaldups(bed_path, localdups_file, out_name):
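    # Write a copy of the BED file that drops every accession listed as a
    # child (i.e. a tandem duplicate) in the localdups file.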
    bed = Bed(bed_path)
    children = []
    for line in open(localdups_file):
        dups = DupLine(line)
        children += dups.children
    print >> sys.stderr, "write tandem-filtered bed file {0}".format(out_name)
    fh = open(out_name, "w")
    for i, row in enumerate(bed):
        if row['accn'] in children: continue
        print >> fh, row
    fh.close()
Code example #8
def build_order(qbed_file, sbed_file):
    print("Read annotation files %s and %s" % (qbed_file, sbed_file), file=sys.stderr)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    return qbed, sbed, qorder, sorder
Code example #9
def main(qfasta, sfasta, options):
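    # Run BLAST commands generated from the anchor regions in parallel with a
    # multiprocessing Pool, printing the hit lines to stdout and a progress
    # estimate to stderr roughly every 500 command groups.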
    qfasta = Fasta(qfasta)
    sfasta = Fasta(sfasta)

    if not (options.qbed and options.sbed):
        anchors = PositionAnchor(options.anchors)
    else:
        qbed = Bed(options.qbed)
        sbed = Bed(options.sbed)
        anchors = Anchor(options.anchors, qbed, sbed)
    cpus = cpu_count()
    pool = Pool(cpus)

    for i, command_group in enumerate(
            anchors.gen_cmds(qfasta, sfasta, options.dist, options.cmd)):
        if not (i - 1) % 500:
            print >> sys.stderr, "complete: %.5f" % ((
                (i - 1.0) * cpus) / len(anchors))
        for lines in pool.map(run_blast, command_group):
            for line in lines:
                line[6:10] = map(str, line[6:10])
                print "\t".join(line)
Code example #10
File: synteny.py Project: gturco/best_hit
def main(qbed_file,sbed_file,pairs_file,out,query=True):
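    # For every orthologous query gene, record whether its left and right
    # neighbours in the query BED are also orthologs (one tab-separated line
    # per gene: accn, left_ortho, right_ortho).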
    out_fh = open(out,"wb")
    if query:
        qbed = Bed(qbed_file)
        sbed = Bed(sbed_file).get_order()
    else:
        qbed = Bed(sbed_file)
        sbed = Bed(qbed_file).get_order()
        
    qorder = qbed.get_order()
    pairs,pairs_dict = get_pairs(pairs_file,query)
    if query: qorthos = list(set([qaccn for qaccn,saccn in pairs]))
    else: qorthos = list(set([saccn for qaccn,saccn in pairs]))
    qaccns = get_pos(qorder,qorthos)
    for qi in qaccns:
        ### had to change from int search to term because of issues with
        ### merging see Os12g12370
        left_ortho = qbed[qi-1].accn in qorthos
        right_ortho = qbed[qi+1].accn in qorthos
        line = "{0}\t{1}\t{2}\n".format(qbed[qi].accn,left_ortho,right_ortho)
        out_fh.write(line)
    out_fh.close()
Code example #11
File: qa_parsers.py Project: yuzhenpeng/find_cns
class RawBed(object):
    """takes line from habos raw file and converts to brents bed line"""
    def __init__(self, bed):
        self.bed = Bed(bed)
        self.order = self.bed.get_order()

    def raw_to_bed(self, raw_pos):
        """returns the bed file for the raw_pos"""
        bed_info = self.bed[raw_pos]
        d = {}
        d['start'] = bed_info.start
        d['end'] = bed_info.end
        d['seqid'] = bed_info.seqid
        d['accn'] = bed_info.accn
        args = bed_info.stuff
        d['strand'] = args[1]
        #d['locs'] = loc_conv(args[-2],args[-1])
        return d

    def accn_to_raw(self, accn):
        "returns the raw line inputs for accn"
        pos = self.order[accn][0]
        seqid = self.order[accn][1].seqid
        return pos, seqid
Code example #12
File: qa_parsers.py Project: gturco/find_cns
def __init__(self, bed):
    self.bed = Bed(bed)
    self.order = self.bed.get_order()
Code example #13
def main(blast_file, options):
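    # Filter a raw BLAST file against the two BED annotations: keep only the
    # best-scoring hit per (query, subject) pair, then optionally apply the
    # global-density, local-duplicate (tandem), repeat and c-score filters,
    # and finally write the surviving hits to a .raw file.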

    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"
    
    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    fp2 = []
    for x in fp:
        if x[0] == '#': continue
        fp2.append(x)
    blasts = sorted([BlastLine(line) for line in fp2], \
            key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set() 
    ostrip = options.strip_names
    for b in blasts:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]
        
        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']
        
        filtered_blasts.append(b)


    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts, 
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w")

        if is_self:
            for s in standems: qtandems.join(*s)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w")
            sdups_to_mother = write_localdups(sdups_fh, standems, sbed)

        if options.tandems_only:
            # just want to use this script as a tandem finder.
            sys.exit()

        # write out new .bed after tandem removal
        write_new_bed(qbed, qdups_to_mother)
        if not is_self:
            write_new_bed(sbed, sdups_to_mother)
        
        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        qnew_name = "%s.nolocaldups%s" % op.splitext(qbed.filename)
        snew_name = "%s.nolocaldups%s" % op.splitext(sbed.filename)

        qbed_new = Bed(qnew_name)
        sbed_new = Bed(snew_name)

        qorder = qbed_new.get_order()
        sorder = sbed_new.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the repeat filter", 
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    raw_name = "%s.raw" % op.splitext(blast_file)[0]
    raw_fh = open(raw_name, "w")

    write_raw(qorder, sorder, filtered_blasts, raw_fh)
    if options.write_filtered_blast:
        write_new_blast(filtered_blasts)
Code example #14
File: qa_plot.py Project: xdwang1991/coge
    ax.yaxis.set_major_formatter(formatter)
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(),
             color='gray',
             size=10)

    root.set_axis_off()
    print >> sys.stderr, "print image to %s" % image_name
    plt.savefig(image_name, dpi=600)


if __name__ == "__main__":
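    # Command-line entry point: expects a single .qa file argument plus
    # --qbed and --sbed, and saves the dotplot next to it as a .png.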

    import optparse

    parser = optparse.OptionParser(__doc__)
    parser.add_option("--qbed", dest="qbed", help="path to qbed")
    parser.add_option("--sbed", dest="sbed", help="path to sbed")

    (options, args) = parser.parse_args()

    if not (len(args) == 1 and options.qbed and options.sbed):
        sys.exit(parser.print_help())

    qbed = Bed(options.qbed)
    sbed = Bed(options.sbed)

    qa_file = args[0]

    image_name = op.splitext(qa_file)[0] + ".png"
    dotplot(qa_file, qbed, sbed, image_name)
Code example #15
File: qa_parsers.py Project: yuzhenpeng/find_cns
def __init__(self, bed):
    self.bed = Bed(bed)
    self.order = self.bed.get_order()
Code example #16
File: blast_to_raw.py Project: LyonsLab/coge
def main(blast_file, options):
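    # CoGe variant of the BLAST-filtering pipeline: skip '#' comment lines,
    # keep the best-scoring hit per (query, subject) pair, optionally apply
    # the global-density, tandem, repeat and c-score filters, and write the
    # filtered BLAST lines out (the .raw output below is commented out).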

    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"
    
    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    localdups = options.localdups

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    
    # mdb added 3/18/16 for Last v731
    blasts = []
    for line in fp:
        if not line.startswith("#"):
            blasts.append(BlastLine(line))
    blasts = sorted(blasts, key=lambda b: b.score, reverse=True)
            
    # mdb removed 3/18/16 for Last v731
#     blasts = sorted([BlastLine(line) for line in fp], \
#             key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set() 
    ostrip = options.strip_names
    for b in blasts:
        query, subject = b.query, b.subject
        #if ostrip:
        #    query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]
        
        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']
        
        filtered_blasts.append(b)


    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts, 
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w") if localdups else None

        if is_self:
            for s in standems: qtandems.join(*s)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w") if localdups else None
            sdups_to_mother = write_localdups(sdups_fh, standems, sbed)

        if localdups:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)
        
        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        qbed.beds = [x for x in qbed if x["accn"] not in qdups_to_mother]
        sbed.beds = [x for x in sbed if x["accn"] not in sdups_to_mother]

        qorder = qbed.get_order()
        sorder = sbed.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the repeat filter", 
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    #raw_name = "%s.raw" % op.splitext(blast_file)[0]
    #raw_fh = open(raw_name, "w")

    #write_raw(qorder, sorder, filtered_blasts, raw_fh)
    write_new_blast(filtered_blasts) 
Code example #17
def main(anchor_file, blast_file, options):
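    # Index the existing anchors per chromosome pair with a KD-tree, then
    # report every filtered BLAST hit that lies within options.dist
    # (Manhattan distance) of an anchor as a new "lifted" pair on stdout.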

    qbed_file, sbed_file = options.qbed, options.sbed
    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    _ = lambda x: x.rsplit(".", 1)[0]

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp], \
            key=lambda b: b.score, reverse=True)
    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = _(b.query), _(b.subject)
        if query not in qorder or subject not in sorder: continue
        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid
        
        filtered_blasts.append(b)


    all_anchors = collections.defaultdict(list)
    fp = file(anchor_file)
    for row in fp:
        if row[0]=='#': continue
        a, b = row.split()
        if a not in qorder or b not in sorder: continue
        qi, q = qorder[a]
        si, s = sorder[b]
        all_anchors[(q.seqid, s.seqid)].append((qi, si))

    # grouping the hits based on chromosome pair for sending in find_nearby
    all_hits = collections.defaultdict(list)
    for b in filtered_blasts:
        all_hits[(b.qseqid, b.sseqid)].append((b.qi, b.si))

    # select hits that are close to the anchor list
    j = 0
    fw = sys.stdout
    for chr_pair in sorted(all_hits.keys()):
        hits = np.array(all_hits[chr_pair])
        anchors = np.array(all_anchors[chr_pair])

        print >>sys.stderr, chr_pair, len(anchors)
        if len(anchors)==0: continue
        tree = cKDTree(anchors, leafsize=16)
        #print tree.data
        dists, idxs = tree.query(hits, p=1, distance_upper_bound=options.dist)
        #print [(d, idx) for (d, idx) in zip(dists, idxs) if idx!=tree.n]

        for i, (dd, idx) in enumerate(zip(dists, idxs)):
            if dd==0: continue # same anchors
            if idx!=tree.n:
                qi, si = hits[i]
                query, subject = qbed[qi]["accn"], sbed[si]["accn"]
                print >>fw, "\t".join((query, subject, "lifted"))
                j+=1
    
    print >>sys.stderr, j, "new pairs found"
Code example #18
def dotplot(ax, anchors, qbed, sbed, topn, axes):
    # modified from Haibao Tang's original function
    '''function to plot merged topn anchors.'''
    _ = lambda x: r"$\rm{%s}$" % x.replace(" ", r"\ ")
    get_order = lambda bed: dict(
        (f['accn'], (i, f)) for (i, f) in enumerate(bed))
    get_len = lambda bed: sum([f.end for (i, f) in enumerate(bed)])
    qbed = Bed(qbed)
    sbed = Bed(sbed)
    xmax, ymax = len(qbed), get_len(sbed)
    print xmax, ymax

    qorder = get_order(qbed)
    chr_len = [f.end for (i, f) in enumerate(sbed)]

    # get topn hits
    data = []
    cur_q = ""
    for anchor in anchors:
        if anchor[0] != cur_q: n = 1
        if n > topn: continue
        try:
            qgene = qorder[anchor[0].split('.')[0]]
        except:
            continue
        if 'r' in anchor[0]: continue
        anchor[4] += sum(chr_len[0:(int(anchor[1].lstrip('chr0')) - 1)])
        if qgene[0] < xmax and anchor[4] < ymax:
            data.append((qgene[0], anchor[4]))
        cur_q = anchor[0]
        n += 1

    print 'data length: ', len(data)

    x, y = zip(*data)
    x, y = np.array(x, 'f') / xmax, np.array(y, 'f') / ymax
    ax.scatter(x, y, c='b', s=.5, lw=0, alpha=.8)

    ax.get_xaxis().set_ticks([])
    ax.get_yaxis().set_ticks([])

    xchr_labels, ychr_labels = [], []
    cbreaks = {}
    # plot the chromosome breaks
    for (seqid, beg, end) in get_breaks(qbed):
        if "random" in seqid: continue
        cbreaks[("query", seqid)] = (beg, end)
        xchr_labels.append((seqid, (beg + end) / 2))
        x, y = np.array([beg, beg], 'f') / xmax, [0, 1]
        ax.plot(x, y, "-", color='y', alpha=.8, zorder=10)
    ax.add_patch(
        Rectangle((.998, 0),
                  .002,
                  1,
                  lw=.2,
                  color='y',
                  fc='y',
                  fill=True,
                  alpha=.8,
                  zorder=10))

    get_breaks_subject = lambda bed: [[f.accn, f.start, f.end]
                                      for (i, f) in enumerate(bed)]
    chr_cum = 0
    for items in get_breaks_subject(sbed):
        seqid, beg, end = items
        beg += chr_cum
        end += chr_cum
        if "random" in seqid: continue
        cbreaks[("subject", seqid)] = (beg, end)
        ychr_labels.append((seqid, (beg + end) / 2))
        x, y = [0, 1], np.array([beg, beg], 'f') / ymax
        ax.plot(x, y, "-", color='y', alpha=.8, zorder=10)
        chr_cum = end
    ax.add_patch(
        Rectangle((0, 1),
                  1,
                  .002,
                  lw=.2,
                  color='y',
                  fc='y',
                  fill=True,
                  alpha=.8,
                  zorder=10))

    # plot the chromosome labels
    for label, pos in xchr_labels:
        x, y = pos * 1. / xmax - .015, 1.02
        if label[0] == "0": label = label.replace("0", "")
        ax.text(x, y, _("%s" % label))
    for label, pos in ychr_labels:
        x, y = -.065, pos * 1. / ymax - .0065
        if label[0] == "0": label = label.replace("0", "")
        ax.text(x, y, _("%s" % label))
    # plot axis labels
    ax.text(.5, 1.06, _("%s" % axes[0]))
    ax.text(-.1, .5, _("%s" % axes[1]), rotation="vertical")
Code example #19
def main(anchor_file, blast_file, options):

    qbed_file, sbed_file = options.qbed, options.sbed
    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >> sys.stderr, "... looks like a self-self BLAST to me"

    print >> sys.stderr, "read annotation files %s and %s" % (qbed_file,
                                                              sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    _ = lambda x: x.rsplit(".", 1)[0]

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp], \
            key=lambda b: b.score, reverse=True)
    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = _(b.query), _(b.subject)
        if query not in qorder or subject not in sorder: continue
        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen: continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    all_anchors = collections.defaultdict(list)
    fp = file(anchor_file)
    for row in fp:
        if row[0] == '#': continue
        a, b = row.split()
        if a not in qorder or b not in sorder: continue
        qi, q = qorder[a]
        si, s = sorder[b]
        all_anchors[(q.seqid, s.seqid)].append((qi, si))

    # grouping the hits based on chromosome pair for sending in find_nearby
    all_hits = collections.defaultdict(list)
    for b in filtered_blasts:
        all_hits[(b.qseqid, b.sseqid)].append((b.qi, b.si))

    # select hits that are close to the anchor list
    j = 0
    fw = sys.stdout
    for chr_pair in sorted(all_hits.keys()):
        hits = np.array(all_hits[chr_pair])
        anchors = np.array(all_anchors[chr_pair])

        print >> sys.stderr, chr_pair, len(anchors)
        if len(anchors) == 0: continue
        tree = cKDTree(anchors, leafsize=16)
        #print tree.data
        dists, idxs = tree.query(hits, p=1, distance_upper_bound=options.dist)
        #print [(d, idx) for (d, idx) in zip(dists, idxs) if idx!=tree.n]

        for i, (dd, idx) in enumerate(zip(dists, idxs)):
            if dd == 0: continue  # same anchors
            if idx != tree.n:
                qi, si = hits[i]
                query, subject = qbed[qi]["accn"], sbed[si]["accn"]
                print >> fw, "\t".join((query, subject, "lifted"))
                j += 1

    print >> sys.stderr, j, "new pairs found"