Example #1
0
def collect_best_hits(filename):
    """Collect the best-scoring BLAST matches for each query in *filename*.

    The file is parsed with ``blastparser.parse_fp``.  Within a record the
    matches are visited in file order; a match is stored unless its score is
    lower than the best score recorded so far for that record.  Scanning of
    a record's hits stops once the most recently seen match score differs
    from that best score.

    Query names starting with ``'gi'`` are replaced by the text after their
    second ``'|'``.  A progress line goes to stderr every 25000 records.

    Returns a dict mapping query name -> list of ``(subject_name, score)``.
    """
    d = {}
    # 'with' guarantees the BLAST file is closed, even if parsing raises.
    with open(filename) as fp:
        for n, record in enumerate(blastparser.parse_fp(fp)):
            if n % 25000 == 0:
                print >>sys.stderr, '...', filename, n
            best_score = None
            for hit in record.hits:
                for match in hit.matches:
                    query = record.query_name
                    if query.startswith('gi'):
                        query = query.split('|', 2)[2]
                    subject = hit.subject_name

                    score = match.score

                    # only keep the best set of scores for any query
                    if best_score and best_score > score:
                        continue
                    best_score = score

                    d.setdefault(query, []).append((subject, score))

                # the last score seen fell below the best one: nothing
                # further in this record is kept, so stop scanning its hits
                if best_score and best_score != score:
                    break
    return d
Example #2
0
def collect_best_hits(filename, qfn=None):
    d = {}
    for n, record in enumerate(blastparser.parse_fp(open(filename))):
        if n % 10000 == 0:
            print '...', n
        best_score = None
        for hit in record.hits:
            for match in hit.matches:
                query = record.query_name
                if qfn:
                    query = qfn(query)
                subject = hit.subject_name
                score = match.score

                # only keep the best set of scores for any query
                if best_score and best_score > score:
                    continue
                best_score = score

                x = d.get(query, [])
                x.append((subject, score))
                d[query] = x

            if best_score and best_score != score:
                break
    return d
def read_blast(file_name):
    """Return the query name of every match found in a BLAST output file.

    One list entry is produced per match, so a query name is repeated once
    for each match across all of its hits.
    """
    hit_names = []
    # 'with' guarantees the BLAST file is closed, even if parsing raises.
    with open(file_name) as fp:
        for record in blastparser.parse_fp(fp):
            for hit in record:
                # one entry per match; the match object itself is not used
                for _match in hit.matches:
                    hit_names.append(record.query_name)
    return hit_names
def read_blast(file_name):
    """Map each query name in a BLAST output file to a subject name.

    The mapping is rewritten for every match, so a query ends up associated
    with the subject of the last hit (in file order) that has at least one
    match.  Queries without any match do not appear in the result.
    """
    hit_names = {}
    # 'with' guarantees the BLAST file is closed, even if parsing raises.
    with open(file_name) as fp:
        for record in blastparser.parse_fp(fp):
            for hit in record:
                # assign once per match; later hits overwrite earlier ones
                for _match in hit.matches:
                    hit_names[record.query_name] = hit.subject_name
    return hit_names
def collect_blast_hits_by_family(blastfile):
    """Group qualifying BLAST matches by the ``tr<N>`` number of the query.

    The grouping key is the integer N taken from the third dot-separated
    field of each query name, which must start with ``'tr'``.  For every
    record, the matches scoring at least ``BITSCORE_CUTOFF`` are gathered as
    ``(name, score)`` tuples, where *name* is the second ``'|'``-separated
    field of the subject name.

    Returns a dict mapping N -> list holding one (possibly empty) list of
    tuples per record with that key.
    """
    d = {}
    # 'with' guarantees the BLAST file is closed, even if parsing raises.
    with open(blastfile) as fp:
        for record in blastparser.parse_fp(fp):
            tr = record.query_name.split('.')[2]
            assert tr.startswith('tr')
            tr = int(tr[2:])

            collect = []
            for hit in record.hits:
                for match in hit.matches:
                    if match.score >= BITSCORE_CUTOFF:
                        name = hit.subject_name.split('|')[1]
                        collect.append((name, match.score))

            # a list is appended for every record, even when it is empty
            d.setdefault(tr, []).append(collect)

    return d
def collect_blast_hits_by_family(blastfile):
    """Collect above-cutoff BLAST matches, keyed by the query's tr number.

    Each query name is expected to carry ``tr<N>`` as its third
    dot-separated field; the integer N is the dictionary key.  Per record,
    every match with ``score >= BITSCORE_CUTOFF`` contributes a tuple of
    (second ``'|'``-separated field of the subject name, score).

    Returns a dict mapping N -> list of per-record tuple lists; a record
    with no qualifying match contributes an empty list.
    """
    d = {}
    # Close the BLAST file deterministically instead of leaking the handle.
    with open(blastfile) as fp:
        for record in blastparser.parse_fp(fp):
            tr = record.query_name.split('.')[2]
            assert tr.startswith('tr')
            tr = int(tr[2:])

            collect = [(hit.subject_name.split('|')[1], match.score)
                       for hit in record.hits
                       for match in hit.matches
                       if match.score >= BITSCORE_CUTOFF]

            d.setdefault(tr, []).append(collect)

    return d
import screed

# Matches scoring below this value are skipped in the BLAST loop below.
MIN_SCORE = 200
# Minimum sequence length for a query to be kept (third command-line argument).
MIN_QUERY_LEN = int(sys.argv[3])

# Names of the sequences in the fourth command-line argument that are at
# least MIN_QUERY_LEN long; BLAST records for other queries are ignored.
query_seqs = set([record.name for record in screed.open(sys.argv[4]) if len(record.sequence) >= MIN_QUERY_LEN])

# Coverage lists for the sequences in the first command-line argument:
# one zero-initialised slot per position of each sequence.
covs = {}
for n, record in enumerate(screed.open(sys.argv[1])):
    # progress marker every 1000 sequences
    if n % 1000 == 0:
        sys.stdout.write("+")
        sys.stdout.flush()

    covs[record.name] = [0] * len(record.sequence)

# Walk the BLAST output given as the second command-line argument.
for n, record in enumerate(blastparser.parse_fp(open(sys.argv[2]))):
    # progress marker every 100 records
    if n % 100 == 0:
        sys.stdout.write(".")
        sys.stdout.flush()

    # ignore queries that did not pass the length filter above
    if record.query_name not in query_seqs:
        continue

    for hit in record.hits:
        for match in hit.matches:
            # skip weak matches
            if match.score < MIN_SCORE:
                continue

            # Skip subjects without a coverage list.  An empty list (from a
            # zero-length sequence) is falsy as well and is skipped too.
            cov = covs.get(hit.subject_name)
            if not cov:
                continue
            # NOTE(review): the fragment ends here; the code that would
            # update `cov` is not visible in this file.
# Matches scoring below this value are skipped in the BLAST loop below.
MIN_SCORE = 200
# Minimum sequence length for a query to be kept (third command-line argument).
MIN_QUERY_LEN = int(sys.argv[3])

# Names of the sequences in the fourth command-line argument that are at
# least MIN_QUERY_LEN long; BLAST records for other queries are ignored.
query_seqs = set([ record.name for record in screed.open(sys.argv[4]) \
                       if len(record.sequence) >= MIN_QUERY_LEN ])

# Coverage lists for the sequences in the first command-line argument:
# one zero-initialised slot per position of each sequence.
covs = {}
for n, record in enumerate(screed.open(sys.argv[1])):
    # progress marker every 1000 sequences
    if n % 1000 == 0:
        sys.stdout.write('+')
        sys.stdout.flush()

    covs[record.name] = [0] * len(record.sequence)

# Walk the BLAST output given as the second command-line argument.
for n, record in enumerate(blastparser.parse_fp(open(sys.argv[2]))):
    # progress marker every 100 records
    if n % 100 == 0:
        sys.stdout.write('.')
        sys.stdout.flush()

    # ignore queries that did not pass the length filter above
    if record.query_name not in query_seqs:
        continue

    for hit in record.hits:
        for match in hit.matches:
            # skip weak matches
            if match.score < MIN_SCORE:
                continue

            # Skip subjects without a coverage list.  An empty list (from a
            # zero-length sequence) is falsy as well and is skipped too.
            cov = covs.get(hit.subject_name)
            if not cov:
                continue
            # NOTE(review): the fragment ends here; the code that would
            # update `cov` is not visible in this file.
Example #9
0
#! /usr/bin/env python
import sys
import csv
import blastparser

# get the filename as the first argument on the command line
filename = sys.argv[1]

# send output as comma-separated values to stdout
output = csv.writer(sys.stdout)

# open the BLAST file for reading; 'with' closes it once parsing is done,
# including when parsing raises
with open(filename) as fp:
    # parse BLAST records
    for record in blastparser.parse_fp(fp):
        for hit in record:
            for match in hit.matches:
                # output each match as a separate row:
                # query name, subject name, score, expect value
                row = [record.query_name, hit.subject_name, match.score,
                       match.expect]
                output.writerow(row)
           ident = record.name
        d[ident] = record.description
    return d
 
# command-line arguments: the query sequence file, the "against" sequence
# file, and the BLAST output file to parse
query_seqs = sys.argv[1]
against_seqs = sys.argv[2]

# Open the BLAST output first (so a bad path fails before the name tables
# are loaded); 'with' closes it again when the script is done with it.
with open(sys.argv[3]) as fp:
    print >>sys.stderr, "reading query seq names from", query_seqs
    query_db = load_names(query_seqs)
    print >>sys.stderr, "reading against seq names from", against_seqs
    against_db = load_names(against_seqs)

    # send output as comma-separated values to stdout
    output = csv.writer(sys.stdout)

    # parse BLAST records
    print >>sys.stderr, 'parsing BLAST output'
    for record in blastparser.parse_fp(fp):
        # the query description is the same for every match of this record;
        # names missing from the table get an empty description
        query_descr = query_db.get(record.query_name, "")
        for hit in record:
            # likewise, one subject description per hit
            against_descr = against_db.get(hit.subject_name, "")
            for match in hit.matches:
                # output each match as a separate row
                row = [record.query_name, query_descr,
                       hit.subject_name, against_descr, match.score,
                       match.expect]
                output.writerow(row)
Example #11
0
#! /usr/bin/env python
import sys
sys.path.insert(0, '/u/t/dev/blastkit/lib')
import blastparser
import screed
from pygr.sequence import Sequence

seqsfile = sys.argv[1]
genome_name = seqsfile[:-3]
seqdb = screed.ScreedDB(seqsfile)

blastfile = 'large.x.' + genome_name
for record in blastparser.parse_fp(open(blastfile)):
    tagname = record.query_name ##
    for hit in record.hits:
        for match in hit:
            seq = Sequence(seqdb[hit.subject_name].sequence, tagname)
            start, end = match.subject_start, match.subject_end
            subseq = seq[start:end]

            print '>%s.%s\n%s' % (genome_name, tagname, subseq)
            break
        break
Example #12
0
#! /usr/bin/env python
import sys
import blastparser
import screed

MIN_SCORE = 200

covs = {}
for record in screed.open(sys.argv[1]):
    covs[record.name] = [0] * len(record.sequence)

#print 'before'
for record in blastparser.parse_fp(open(sys.argv[2])):
    #print record
    #print 'test'
    #sys.stdout.write('.')
    #sys.stdout.flush()

    #print record
    for hit in record.hits:
        #print hit
        for match in hit.matches:

            cov = covs[hit.subject_name]
            start = min(match.subject_start, match.subject_end) - 1
            end = max(match.subject_start, match.subject_end)
            for i in range(start, end):
                cov[i] = 1

print ''
import screed
import blastparser
import sys

infile = sys.argv[1]
# The original read sys.argv[0], which is the path of this script itself.
# NOTE(review): the BLAST file is assumed to be the second argument -- confirm.
blast_file = sys.argv[2]

# sequence name -> sequence length, for every sequence in the input file
lengths = {}
for n, record in enumerate(screed.open(infile)):
    lengths[record['name']] = len(record['sequence'])

idn, cnt, num = 0, 0, 0
# parse_fp is given an open file everywhere else in this code base, not a
# path string; 'with' also makes sure the file is closed again.
with open(blast_file) as blast_fp:
    for hits in blastparser.parse_fp(blast_fp):
        # only the first hit of each record is considered
        top_matches = hits[0].matches
        if not top_matches:
            # nothing to average; also avoids dividing by zero below
            continue

        hit_len = 0
        hit_idn = 0
        for match in top_matches:
            hit_len = len(match.query_sequence) + hit_len
            hit_idn = match.identity + hit_idn

        # mean identity over the matches of the first hit (the original
        # called float() on the hit object itself, which cannot work)
        hit_idn = hit_idn / float(len(top_matches))

        # fraction of the query covered by the aligned query segments
        covered = hit_len / float(lengths[hits.query_name])
        if covered >= 1:
            idn = hit_idn + idn
        else:
            # weight the identity by the covered fraction
            idn = hit_idn * covered + idn
Example #14
0
#! /usr/bin/env python
import sys
import blastparser
import screed


MIN_SCORE = 200

# Coverage table: sequence name -> list with one 0 per sequence position,
# built from the first command-line argument.
covs = {}
for record in screed.open(sys.argv[1]):
    covs[record.name] = [0] * len(record.sequence)

# Flag each subject position that falls inside a BLAST match.  The BLAST
# file (second command-line argument) is closed by 'with' when the loop
# ends or raises.
with open(sys.argv[2]) as blast_fp:
    for record in blastparser.parse_fp(blast_fp):
        for hit in record.hits:
            for match in hit.matches:
                cov = covs[hit.subject_name]
                # Coordinates may be given in either order; convert them to
                # a 0-based, half-open index range before marking.
                start = min(match.subject_start, match.subject_end) - 1
                end = max(match.subject_start, match.subject_end)
                for i in range(start, end):
                    cov[i] = 1
fa_1 = sys.argv[1]
fa_2 = sys.argv[2]
blast_1x2 = sys.argv[3]
blast_2x1 = sys.argv[4]

seqs1 = {}
seqs2 = {}

for record in screed.open(fa_1):
    seqs1[record.name] = record.sequence

for record in screed.open(fa_2):
    seqs2[record.name] = record.sequence

for b in blastparser.parse_fp(open(blast_1x2)):
    assert b.query_name in seqs1
    del seqs1[b.query_name]

for b in blastparser.parse_fp(open(blast_2x1)):
    assert b.query_name in seqs2
    del seqs2[b.query_name]

print len(seqs1), 'missing 1'
print len(seqs2), 'missing 2'

fp = open(blast_1x2 + '.missing', 'w')
for name in seqs1:
    fp.write(">%s\n%s\n" % (name, seqs1[name]))
fp.close()