Ejemplo n.º 1
0
def main(blast_file, options):
    """Filter the hits of a BLAST file and pass the survivors on for output.

    Hits are read from ``blast_file`` (lines starting with ``#`` are
    skipped), sorted by descending score, restricted to genes present in the
    query/subject .bed annotations and de-duplicated per (query, subject)
    pair.  The optional filters enabled in ``options`` are then applied in
    this order: global density, local (tandem) duplicates, repeats, cscore.
    The remaining hits are handed to ``write_new_blast``.  Progress and
    warning messages are printed to stderr.

    Args:
        blast_file: path of the BLAST output file to read.
        options: object providing the attributes ``qbed`` and ``sbed``
            (annotation file paths), ``global_density_ratio``,
            ``tandem_Nmax``, ``filter_repeats``, ``cscore`` and
            ``localdups``.  A falsy filter value skips that filter.  When
            ``localdups`` is truthy (and ``tandem_Nmax`` is set), the
            duplicate groups are written to ``<bed basename>.localdups``
            and ``write_new_bed`` is called for the affected annotations.
    """
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    localdups = options.localdups

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # The context manager closes the BLAST file even if a line fails to parse.
    with open(blast_file) as fp:
        # first pass only counts lines for the progress message
        print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
                (blast_file, sum(1 for line in fp))
        fp.seek(0)

        # skip '#' lines (mdb added 3/18/16 for Last v731)
        blasts = [BlastLine(line) for line in fp if not line.startswith("#")]
    blasts.sort(key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = b.query, b.subject
        # name stripping is currently disabled:
        #if options.strip_names:
        #    query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        # blasts is sorted by descending score, so the first hit seen for a
        # pair is the one that is kept
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']

        filtered_blasts.append(b)

    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w") if localdups else None
        sdups_fh = None
        try:
            if is_self:
                # self-self: fold the subject-side groups into the query-side ones
                for members in standems:
                    qtandems.join(*members)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)

            if is_self:
                sdups_to_mother = qdups_to_mother
            else:
                sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w") if localdups else None
                sdups_to_mother = write_localdups(sdups_fh, standems, sbed)
        finally:
            # make sure the .localdups files are flushed and released;
            # close() is harmless if the handle was already closed
            for fh in (qdups_fh, sdups_fh):
                if fh is not None:
                    fh.close()

        if localdups:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        # drop the duplicates from the annotations and refresh the orderings
        qbed.beds = [x for x in qbed if x["accn"] not in qdups_to_mother]
        sbed.beds = [x for x in sbed if x["accn"] not in sdups_to_mother]

        qorder = qbed.get_order()
        sorder = sbed.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        # trailing comma: no newline here, the next message continues the line
        print >>sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    #raw_name = "%s.raw" % op.splitext(blast_file)[0]
    #raw_fh = open(raw_name, "w")

    #write_raw(qorder, sorder, filtered_blasts, raw_fh)
    write_new_blast(filtered_blasts)
Ejemplo n.º 2
0
def main(blast_file, options):
    """Filter the hits of a BLAST file and pass the survivors on for output.

    Hits are read from ``blast_file`` (lines starting with ``#`` are
    skipped), sorted by descending score, restricted to genes present in the
    query/subject .bed annotations and de-duplicated per (query, subject)
    pair.  The optional filters enabled in ``options`` are then applied in
    this order: global density, local (tandem) duplicates, repeats, cscore.
    The remaining hits are handed to ``write_new_blast``.  Progress and
    warning messages are printed to stderr.

    Args:
        blast_file: path of the BLAST output file to read.
        options: object providing the attributes ``qbed`` and ``sbed``
            (annotation file paths), ``global_density_ratio``,
            ``tandem_Nmax``, ``filter_repeats``, ``cscore`` and
            ``localdups``.  A falsy filter value skips that filter.  When
            ``localdups`` is truthy (and ``tandem_Nmax`` is set), the
            duplicate groups are written to ``<bed basename>.localdups``
            and ``write_new_bed`` is called for the affected annotations.
    """
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >> sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    localdups = options.localdups

    print >> sys.stderr, "read annotation files %s and %s" % (qbed_file,
                                                              sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # The context manager closes the BLAST file even if a line fails to parse.
    with open(blast_file) as fp:
        # first pass only counts lines for the progress message
        print >> sys.stderr, "read BLAST file %s (total %d lines)" % (
            blast_file, sum(1 for line in fp))
        fp.seek(0)

        # skip '#' lines (mdb added 3/18/16 for Last v731)
        blasts = [BlastLine(line) for line in fp if not line.startswith("#")]
    blasts.sort(key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = b.query, b.subject
        # name stripping is currently disabled:
        #if options.strip_names:
        #    query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            print >> sys.stderr, "WARNING: %s not in %s" % (query,
                                                            qbed.filename)
            continue
        if subject not in sorder:
            print >> sys.stderr, "WARNING: %s not in %s" % (subject,
                                                            sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        # blasts is sorted by descending score, so the first hit seen for a
        # pair is the one that is kept
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']

        filtered_blasts.append(b)

    if global_density_ratio:
        print >> sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >> sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed,
                                  filtered_blasts,
                                  flip=True,
                                  tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed,
                                  filtered_blasts,
                                  flip=False,
                                  tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed_file)[0] +
                        ".localdups", "w") if localdups else None
        sdups_fh = None
        try:
            if is_self:
                # self-self: fold the subject-side groups into the query-side ones
                for members in standems:
                    qtandems.join(*members)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)

            if is_self:
                sdups_to_mother = qdups_to_mother
            else:
                sdups_fh = open(op.splitext(sbed_file)[0] +
                                ".localdups", "w") if localdups else None
                sdups_to_mother = write_localdups(sdups_fh, standems, sbed)
        finally:
            # make sure the .localdups files are flushed and released;
            # close() is harmless if the handle was already closed
            for fh in (qdups_fh, sdups_fh):
                if fh is not None:
                    fh.close()

        if localdups:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(
            filter_tandem(filtered_blasts, qdups_to_mother, sdups_to_mother))
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

        # drop the duplicates from the annotations and refresh the orderings
        qbed.beds = [x for x in qbed if x["accn"] not in qdups_to_mother]
        sbed.beds = [x for x in sbed if x["accn"] not in sdups_to_mother]

        qorder = qbed.get_order()
        sorder = sbed.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        # trailing comma: no newline here, the next message continues the line
        print >> sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >> sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    #raw_name = "%s.raw" % op.splitext(blast_file)[0]
    #raw_fh = open(raw_name, "w")

    #write_raw(qorder, sorder, filtered_blasts, raw_fh)
    write_new_blast(filtered_blasts)