def extract_best_pair_hits(blast_output, program='WU'):
    """
    For each pair of (query,target) hits, regardless of hit region length,
    take only the highest scoring pair, then tabulate and plot how many
    pairs fall on each (rounded) score.

    blast_output -- path of the blast output file; every line is handed to
                    parseWUBLASTline, whose result must provide the keys
                    'id1', 'id2' and 'sprime'
    program      -- only 'WU' is accepted; anything else raises Exception

    Side effects:
      * prints one "<rounded score> <number of pairs>" line per score to
        stdout, in increasing score order
      * writes the plot to '<blast_output>_unique_score_dist.png' through
        Gnuplot

    If the file yields no hits at all there is nothing to tabulate or plot:
    a note is written to stderr and the function returns without touching
    Gnuplot (the axis ranges are derived from max(), which is undefined for
    an empty distribution).

    Returns None.
    """
    if program != 'WU':
        raise Exception('temporarily does not support non-WUBLAST output!(TODO)')

    # Best score seen so far per (query, target) pair. Each pair starts from
    # 0, so a pair whose scores are all below 0 is recorded as 0.
    best_of_pair = defaultdict(lambda: 0)
    with open(blast_output) as f:
        for line in f:
            hit_dict = parseWUBLASTline(line)
            pair = (hit_dict['id1'], hit_dict['id2'])
            best_of_pair[pair] = max(best_of_pair[pair], hit_dict['sprime'])

    if not best_of_pair:
        sys.stderr.write("no hits found in {0}, nothing to plot\n".format(blast_output))
        return

    # Histogram of best scores, binned to the nearest integer
    binned_scores = defaultdict(lambda: 0)
    for score in best_of_pair.values():
        binned_scores[int(round(score))] += 1

    # now plot it
    x = sorted(binned_scores)
    y = [binned_scores[k] for k in x]
    for score, count in zip(x, y):
        print("{0} {1}".format(score, count))

    # Imported here so the table above is still printed when the Gnuplot
    # module is not installed
    import Gnuplot
    p = Gnuplot.Gnuplot()
    p('set terminal png')
    p("set xra [0:{0}]".format(max(x) + 10))
    p("set yra [0:{0}]".format(max(y) + 10))
    p("set out '{0}_unique_score_dist.png'".format(blast_output))
    D = Gnuplot.Data(x, y, with_=" lines lt -1 lw 1")
    p.plot(D)
    # 'set out' with no argument makes gnuplot close the png file
    p('set out')
def scatterplot_phylodist_to_score(blast_dirname, filename_rex, phylo_dirname, NONMEMBER_DIST=10.):
    """
    Similar to scatterplot_length_to_maxscore, except that we're plotting
    the branch distance to maximum observed score.
    And we do it PER family blast file since branch distances make sense
    only within a family....

    For every blast file under <blast_dirname> a tab-separated table
    "<distance>\t<max sprime seen at that distance>" is written to
    '<blast file>.phylodist_to_score', sorted by distance. Blast files that
    already have such a table are skipped, so a rerun only processes what
    is missing.

    blast_dirname  -- directory holding the blast output files
    filename_rex   -- compiled regex matched against each file name;
                      group(1) is taken as the family name
    phylo_dirname  -- directory holding the .outtree files
    NONMEMBER_DIST -- distance under which hits between sequences that are
                      not both members of the family are pooled

    Also:
    We assume that the blast files not only match filename_rex, but also
    ends in '.WUblast'
    We assume that under <phylo_dirname>, there will only be 1 .outtree
    file that fits the filename_rex-extracted family name (the first match
    is used; IndexError is raised when there is none)
    We also assume that this blast output is a "real" vs
    "padded_real + negative", so id1 is always not padded, and id2 is
    potentially padded

    A hit counts as within-family when both IDs share the same prefix (the
    text before the first '_', or the whole ID when there is no '_') and
    id1 starts with the family name. If the tree distance for such a hit
    cannot be computed, the pair is reported on stderr and the hit is
    skipped.
    """
    for filename in os.listdir(blast_dirname):
        blast_path = blast_dirname + '/' + filename
        output_path = blast_path + '.phylodist_to_score'
        if os.path.exists(output_path):
            continue
        m = filename_rex.match(filename)
        if m is None or not filename.endswith('WUblast'):
            continue
        family = m.group(1)
        phylo_filename = fnmatch.filter(os.listdir(phylo_dirname), "*{0}*.outtree".format(family))[0]
        p = NewickReader(phylo_dirname + '/' + phylo_filename)
        sys.stderr.write("{0}\n".format(filename))

        # -inf so that the first score seen for a distance always wins
        maxscore_of_dist = defaultdict(lambda: float("-inf"))
        with open(blast_path, 'r') as f:
            for line in f:
                d = parseWUBLASTline(line)
                id1, id2 = d['id1'], d['id2']
                # partition() keeps the whole ID when there is no '_';
                # slicing with find() would silently drop the last char
                same_prefix = id1.partition('_')[0] == id2.partition('_')[0]
                if same_prefix and id1.startswith(family):
                    try:
                        # NOTE(review): the True/False flags presumably tell
                        # parse_id whether the ID is padded -- confirm
                        dist = p.distance(parse_id(id1, True)[1], parse_id(id2, False)[1])
                    except Exception:
                        # best effort: report and move on, but let
                        # KeyboardInterrupt / SystemExit propagate
                        sys.stderr.write(" ".join(["failed phylo dist on ", id1, id2]) + "\n")
                        continue
                    maxscore_of_dist[dist] = max(maxscore_of_dist[dist], d['sprime'])
                else:
                    # a non-hit
                    maxscore_of_dist[NONMEMBER_DIST] = max(maxscore_of_dist[NONMEMBER_DIST], d['sprime'])

        with open(output_path, 'w') as f:
            for k in sorted(maxscore_of_dist):
                f.write("{0}\t{1}\n".format(k, maxscore_of_dist[k]))
def blast_to_graph(real_fna_input, blast_output, score_cutoff, multi_edge, program='WU'):
    """
    Build a sequence-similarity graph from a blast run and pickle it.

    Nodes: one per record ID in the fasta file <real_fna_input>, so that
    sequences without any hit still show up (as loner nodes).

    Edges: one per line of <blast_output> whose 'sprime' is >= score_cutoff,
    joining the hit's 'id1' and 'id2':
      * multi_edge False -- a single edge per pair, carrying the maximum
        observed hit score
      * multi_edge True  -- one edge per hit, carrying
        (region, region, hit_score); each region is a (start, end) pair and
        the region of the smaller ID (by string comparison) comes first

    Self-loops are banned on the graph, which is how self-hits are ignored.

    Only program == 'WU' is supported; any other value raises Exception.

    The graph is written to '<blast_output>_cut<score_cutoff>.gpickle' and
    that filename is returned.
    """
    if program != 'WU':
        raise Exception('temporarily does not support non-WUBLAST output!(TODO)')

    graph = XGraph()
    graph.ban_selfloops()  # this is for ignoring self-hits
    if multi_edge:
        graph.allow_multiedges()
    else:
        graph.ban_multiedges()

    # Register every input sequence first, hit or no hit
    with open(real_fna_input) as fasta_handle:
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            graph.add_node(record.id)

    with open(blast_output) as blast_handle:
        for line in blast_handle:
            hit = parseWUBLASTline(line)
            score = hit['sprime']
            if score < score_cutoff:
                continue
            query, target = hit['id1'], hit['id2']
            if multi_edge:
                query_region = (hit['start1'], hit['end1'])
                target_region = (hit['start2'], hit['end2'])
                # Keep the region order tied to the ID order, whichever
                # sequence happened to be the query of this hit
                if query < target:
                    edge_data = (query_region, target_region, score)
                else:
                    edge_data = (target_region, query_region, score)
                graph.add_edge(query, target, edge_data)
            elif not graph.has_edge(query, target) or graph.get_edge(query, target) < score:
                # first hit for this pair, or a better one than stored
                graph.add_edge(query, target, score)

    gpickle_filename = blast_output + '_cut' + str(score_cutoff) + '.gpickle'
    write_gpickle(graph, gpickle_filename)
    return gpickle_filename
def scatterplot_length_to_maxscore(dirname, filename_pattern):
    """
    Read through ALL files under directory <dirname> whose name matches the
    fnmatch-style <filename_pattern>, assuming WU-BLAST format, and print
    the data points of a length-vs-value scatterplot to stdout (the
    plotting itself is not done here).

    For every hit:
      x = (length of the id2 hit region + length of the id1 hit region) / 2
          where a region length is abs(end - start) + 1
      y = the smallest hit['e'] seen among all hits sharing that x,
          starting from a ceiling of 10

    One "<x> <y>" line is printed per distinct x, in dictionary order
    (i.e. not sorted). Each file name is echoed to stderr as it is read.

    NOTE(review): despite the function name, what is tracked is the minimum
    of hit['e'] (presumably the E-value), not a maximum score -- confirm
    which one is intended.
    """
    best_of_length = defaultdict(lambda: 10)
    matching_files = fnmatch.filter(os.listdir(dirname), filename_pattern)
    for filename in matching_files:
        sys.stderr.write("{0}\n".format(filename))
        with open(dirname + '/' + filename) as handle:
            for line in handle:
                hit = parseWUBLASTline(line)
                region2_len = abs(hit['end2'] - hit['start2']) + 1
                region1_len = abs(hit['end1'] - hit['start1']) + 1
                mean_len = (region2_len + region1_len) / 2
                current = best_of_length[mean_len]
                best_of_length[mean_len] = min(current, hit['e'])

    for length, value in best_of_length.items():
        print("{0} {1}".format(length, value))