def collect_best_hits(filename):
    """Collect the top-scoring BLAST matches for each query in a report.

    The file at ``filename`` is parsed with ``blastparser.parse_fp``.  Within
    one record a match is kept only if its score is not below the best score
    seen so far for that record.  Query names that start with ``gi`` are
    reduced to the text after the second ``|``.

    Args:
        filename: path of the BLAST output file to read.

    Returns:
        dict mapping query name -> list of ``(subject_name, score)`` tuples.
    """
    d = {}

    # Use a context manager so the report is closed even if parsing fails;
    # the original left the handle open.
    with open(filename) as fp:
        for n, record in enumerate(blastparser.parse_fp(fp)):
            if n % 25000 == 0:
                # progress message; same text as the old print-chevron form,
                # but valid on both Python 2 and Python 3
                sys.stderr.write('... %s %s\n' % (filename, n))

            best_score = None
            for hit in record.hits:
                for match in hit.matches:
                    query = record.query_name
                    if query.startswith('gi'):
                        query = query.split('|', 2)[2]
                    subject = hit.subject_name
                    score = match.score

                    # only keep the best set of scores for any query
                    # NOTE(review): the truthiness test treats a best score
                    # of 0 the same as "no score yet".
                    if best_score and best_score > score:
                        continue
                    best_score = score

                    d.setdefault(query, []).append((subject, score))

                # the last match of this hit scored differently from the
                # best one, so stop looking at further hits for this record
                if best_score and best_score != score:
                    break

    return d
def collect_best_hits(filename, qfn=None):
    """Collect the top-scoring BLAST matches for each query in a report.

    The file at ``filename`` is parsed with ``blastparser.parse_fp``.  Within
    one record a match is kept only if its score is not below the best score
    seen so far for that record.

    Args:
        filename: path of the BLAST output file to read.
        qfn: optional callable applied to each query name; its return value
            is used as the dictionary key.  When falsy, the raw query name
            is used.

    Returns:
        dict mapping query name -> list of ``(subject_name, score)`` tuples.
    """
    d = {}

    # Use a context manager so the report is closed even if parsing fails;
    # the original left the handle open.
    with open(filename) as fp:
        for n, record in enumerate(blastparser.parse_fp(fp)):
            if n % 10000 == 0:
                # progress message; a single parenthesized argument prints
                # the same text under the Python 2 print statement and the
                # Python 3 print function
                print('... %s' % (n,))

            best_score = None
            for hit in record.hits:
                for match in hit.matches:
                    query = record.query_name
                    if qfn:
                        query = qfn(query)
                    subject = hit.subject_name
                    score = match.score

                    # only keep the best set of scores for any query
                    # NOTE(review): the truthiness test treats a best score
                    # of 0 the same as "no score yet".
                    if best_score and best_score > score:
                        continue
                    best_score = score

                    d.setdefault(query, []).append((subject, score))

                # the last match of this hit scored differently from the
                # best one, so stop looking at further hits for this record
                if best_score and best_score != score:
                    break

    return d
def read_blast(file_name):
    """Return the query name of every match found in a BLAST report.

    Args:
        file_name: path of the BLAST output file to read.

    Returns:
        list of query names, one entry per match (so a query with several
        matches appears several times).
    """
    hit_names = []

    # Close the report when done; the original leaked the handle.
    with open(file_name) as fp:
        for record in blastparser.parse_fp(fp):
            for hit in record:
                for _match in hit.matches:
                    # one list entry per match; only the query name is kept
                    hit_names.append(record.query_name)

    return hit_names
def read_blast(file_name):
    """Map each query name in a BLAST report to one of its subject names.

    When a query has several hits with matches, later hits overwrite
    earlier ones, so the value is the subject of the last such hit.

    Args:
        file_name: path of the BLAST output file to read.

    Returns:
        dict mapping query name -> subject name.
    """
    hit_names = {}

    # Close the report when done; the original leaked the handle.
    with open(file_name) as fp:
        for record in blastparser.parse_fp(fp):
            for hit in record:
                # A hit is recorded only if it has at least one match.  Every
                # match would store the same key/value, so one write is
                # enough.
                for _match in hit.matches:
                    hit_names[record.query_name] = hit.subject_name
                    break

    return hit_names
def collect_blast_hits_by_family(blastfile):
    """Group the BLAST matches at or above BITSCORE_CUTOFF by ``tr`` number.

    The third dot-separated field of each query name must look like
    ``tr<digits>``; the integer after ``tr`` is the grouping key.

    Args:
        blastfile: path of the BLAST output file to read.

    Returns:
        dict mapping the integer key -> list with one entry per record; each
        entry is a (possibly empty) list of ``(name, score)`` tuples, where
        ``name`` is the second ``|``-separated field of the subject name.

    Raises:
        AssertionError: if the third field of a query name does not start
            with ``tr``.
    """
    d = {}

    # Close the report when done; the original leaked the handle.
    with open(blastfile) as fp:
        for record in blastparser.parse_fp(fp):
            tr = record.query_name.split('.')[2]
            assert tr.startswith('tr'), \
                'unexpected query name: %r' % (record.query_name,)
            tr = int(tr[2:])

            collect = []
            for hit in record.hits:
                for match in hit.matches:
                    if match.score >= BITSCORE_CUTOFF:
                        name = hit.subject_name.split('|')[1]
                        collect.append((name, match.score))

            # a record contributes an entry even when nothing passed the
            # cutoff
            d.setdefault(tr, []).append(collect)

    return d
import screed

# matches scoring below this are skipped
MIN_SCORE = 200
# minimum length for a query sequence to be considered (third argument)
MIN_QUERY_LEN = int(sys.argv[3])

# names of the sequences in the fourth argument that are long enough;
# a generator expression avoids building a throwaway list first
query_seqs = set(record.name for record in screed.open(sys.argv[4])
                 if len(record.sequence) >= MIN_QUERY_LEN)

# one zero per base for every sequence in the first argument
covs = {}
for n, record in enumerate(screed.open(sys.argv[1])):
    if n % 1000 == 0:
        # progress indicator
        sys.stdout.write("+")
        sys.stdout.flush()
    covs[record.name] = [0] * len(record.sequence)

# the BLAST report (second argument) is closed when the block exits; the
# original never closed it
with open(sys.argv[2]) as blast_fp:
    for n, record in enumerate(blastparser.parse_fp(blast_fp)):
        if n % 100 == 0:
            # progress indicator
            sys.stdout.write(".")
            sys.stdout.flush()

        if record.query_name not in query_seqs:
            continue

        for hit in record.hits:
            for match in hit.matches:
                if match.score < MIN_SCORE:
                    continue

                # skips subjects that are unknown and subjects whose
                # coverage list is empty
                cov = covs.get(hit.subject_name)
                if not cov:
                    continue
# matches scoring below this are skipped
MIN_SCORE = 200
# minimum length for a query sequence to be considered (third argument)
MIN_QUERY_LEN = int(sys.argv[3])

# names of the sequences in the fourth argument that are long enough;
# a generator expression avoids building a throwaway list first, and the
# parentheses make the backslash continuation unnecessary
query_seqs = set(
    record.name for record in screed.open(sys.argv[4])
    if len(record.sequence) >= MIN_QUERY_LEN
)

# one zero per base for every sequence in the first argument
covs = {}
for n, record in enumerate(screed.open(sys.argv[1])):
    if n % 1000 == 0:
        # progress indicator
        sys.stdout.write('+')
        sys.stdout.flush()
    covs[record.name] = [0] * len(record.sequence)

# the BLAST report (second argument) is closed when the block exits; the
# original never closed it
with open(sys.argv[2]) as blast_fp:
    for n, record in enumerate(blastparser.parse_fp(blast_fp)):
        if n % 100 == 0:
            # progress indicator
            sys.stdout.write('.')
            sys.stdout.flush()

        if record.query_name not in query_seqs:
            continue

        for hit in record.hits:
            for match in hit.matches:
                if match.score < MIN_SCORE:
                    continue

                # skips subjects that are unknown and subjects whose
                # coverage list is empty
                cov = covs.get(hit.subject_name)
                if not cov:
                    continue
#! /usr/bin/env python
# Convert a BLAST report into CSV on stdout, one row per match:
# query name, subject name, score, expect value.
import sys
import csv
import blastparser

# get the filename as the first argument on the command line
filename = sys.argv[1]

# send output as comma-separated values to stdout
output = csv.writer(sys.stdout)

# open the report for reading; the context manager closes it when parsing
# finishes or fails (the original never closed it)
with open(filename) as fp:
    # parse BLAST records
    for record in blastparser.parse_fp(fp):
        for hit in record:
            for match in hit.matches:
                # output each match as a separate row
                row = [record.query_name, hit.subject_name,
                       match.score, match.expect]
                output.writerow(row)
ident = record.name d[ident] = record.description return d # open the output file for reading query_seqs = sys.argv[1] against_seqs = sys.argv[2] fp = open(sys.argv[3]) print >>sys.stderr, "reading query seq names from", query_seqs query_db = load_names(query_seqs) print >>sys.stderr, "reading against seq names from", against_seqs against_db = load_names(against_seqs) # send output as comma-separated values to stdout output = csv.writer(sys.stdout) # parse BLAST records print >>sys.stderr, 'parsing BLAST output' for record in blastparser.parse_fp(fp): for hit in record: for match in hit.matches: query_descr = query_db.get(record.query_name, "") against_descr = against_db.get(hit.subject_name, "") # output each match as a separate row row = [record.query_name, query_descr, hit.subject_name, against_descr, match.score, match.expect] output.writerow(row)
#! /usr/bin/env python
# For every record in the BLAST report 'large.x.<genome_name>', print the
# subject subsequence covered by the first match of the first hit, in
# FASTA-style format.
import sys
sys.path.insert(0, '/u/t/dev/blastkit/lib')
import blastparser
import screed
from pygr.sequence import Sequence

seqsfile = sys.argv[1]
# the genome name is the sequence file name minus its last three characters
genome_name = seqsfile[:-3]

seqdb = screed.ScreedDB(seqsfile)

blastfile = 'large.x.' + genome_name

# close the report when done; the original leaked the handle
with open(blastfile) as blast_fp:
    for record in blastparser.parse_fp(blast_fp):
        tagname = record.query_name

        # both loops break after one pass: only the first match of the
        # first hit is printed, and nothing is printed if that hit has no
        # matches
        for hit in record.hits:
            for match in hit:
                seq = Sequence(seqdb[hit.subject_name].sequence, tagname)
                start, end = match.subject_start, match.subject_end
                # NOTE(review): coordinates are used as-is for slicing;
                # confirm they follow the slice convention Sequence expects
                subseq = seq[start:end]

                # a single parenthesized argument prints the same text under
                # the Python 2 print statement and the Python 3 print
                # function
                print('>%s.%s\n%s' % (genome_name, tagname, subseq))
                break
            break
#! /usr/bin/env python
# Mark which bases of the sequences in the first argument are covered by
# matches in the BLAST report given as the second argument.
import sys
import blastparser
import screed

# NOTE(review): not referenced in the code visible here
MIN_SCORE = 200

# one zero per base for every sequence in the first argument
covs = {}
for record in screed.open(sys.argv[1]):
    covs[record.name] = [0] * len(record.sequence)

# close the report when done; the original leaked the handle
with open(sys.argv[2]) as blast_fp:
    for record in blastparser.parse_fp(blast_fp):
        for hit in record.hits:
            for match in hit.matches:
                # raises KeyError if the subject is not in the sequence file
                cov = covs[hit.subject_name]

                # min/max make the order of the two coordinates irrelevant;
                # subtracting one from the lower coordinate turns the
                # inclusive pair into a range() span
                start = min(match.subject_start, match.subject_end) - 1
                end = max(match.subject_start, match.subject_end)
                for i in range(start, end):
                    cov[i] = 1

# emits a single newline; this form prints the same under the Python 2 print
# statement and the Python 3 print function
print('')
import screed
import blastparser
import sys

# Accumulate a coverage-weighted identity over the first hit of every BLAST
# record.
infile = sys.argv[1]
# The original read sys.argv[0], which is the path of this script itself.
# NOTE(review): assumed the BLAST report is the second command-line
# argument -- confirm against how the script is invoked.
blast_file = sys.argv[2]

# sequence name -> sequence length
lengths = {}
for n, record in enumerate(screed.open(infile)):
    lengths[record['name']] = len(record['sequence'])

# NOTE(review): cnt and num are not updated in the code visible here
idn, cnt, num = 0, 0, 0

# parse_fp is handed an open file object (the original passed the path
# string); the context manager closes it afterwards
with open(blast_file) as blast_fp:
    for hits in blastparser.parse_fp(blast_fp):
        hit_len = 0
        hit_idn = 0
        n_matches = 0
        # only the first hit of each record is considered
        for match in hits[0].matches:
            hit_len += len(match.query_sequence)
            hit_idn += match.identity
            n_matches += 1

        if not n_matches:
            # nothing to average; avoids a division by zero
            continue

        # The original divided by float(hits[0]), i.e. tried to convert the
        # hit object itself to a number.
        # NOTE(review): assumed the intent is the mean identity over the
        # hit's matches -- confirm.
        hit_idn = hit_idn / float(n_matches)

        # fraction of the query covered by the matched query sequences
        covered = hit_len / float(lengths[hits.query_name])
        if covered >= 1:
            idn = hit_idn + idn
        else:
            # weight the identity by the covered fraction
            idn = hit_idn * covered + idn
#! /usr/bin/env python
# Mark which bases of the sequences in the first argument are covered by
# matches in the BLAST report given as the second argument.
import sys
import blastparser
import screed

# NOTE(review): not referenced in the code visible here
MIN_SCORE = 200

# one zero per base for every sequence in the first argument
covs = {}
for record in screed.open(sys.argv[1]):
    covs[record.name] = [0] * len(record.sequence)

# close the report when done; the original leaked the handle
with open(sys.argv[2]) as blast_fp:
    for record in blastparser.parse_fp(blast_fp):
        for hit in record.hits:
            for match in hit.matches:
                # raises KeyError if the subject is not in the sequence file
                cov = covs[hit.subject_name]

                # min/max make the order of the two coordinates irrelevant;
                # subtracting one from the lower coordinate turns the
                # inclusive pair into a range() span
                start = min(match.subject_start, match.subject_end) - 1
                end = max(match.subject_start, match.subject_end)
                for i in range(start, end):
                    cov[i] = 1
# Report which sequences of two FASTA files never appear as a query in the
# corresponding BLAST reports, and write the missing ones from the first
# file to '<blast_1x2>.missing'.
fa_1 = sys.argv[1]
fa_2 = sys.argv[2]
blast_1x2 = sys.argv[3]
blast_2x1 = sys.argv[4]

# name -> sequence for each input file
seqs1 = {}
seqs2 = {}

for record in screed.open(fa_1):
    seqs1[record.name] = record.sequence

for record in screed.open(fa_2):
    seqs2[record.name] = record.sequence

# Remove every sequence that shows up as a query; whatever is left had no
# record in the BLAST report.  The reports are closed after parsing (the
# original leaked both handles).
with open(blast_1x2) as blast_fp:
    for b in blastparser.parse_fp(blast_fp):
        # fails if a query is unknown or appears in more than one record
        assert b.query_name in seqs1
        del seqs1[b.query_name]

with open(blast_2x1) as blast_fp:
    for b in blastparser.parse_fp(blast_fp):
        assert b.query_name in seqs2
        del seqs2[b.query_name]

# same text as the original print statements; a single parenthesized
# argument prints identically under Python 2 and Python 3
print('%d missing 1' % len(seqs1))
print('%d missing 2' % len(seqs2))

# NOTE(review): only the first set is written out in the code visible here.
# The context manager closes the output even if a write fails.
with open(blast_1x2 + '.missing', 'w') as fp:
    for name in seqs1:
        fp.write(">%s\n%s\n" % (name, seqs1[name]))