def extract_best_pair_hits(blast_output,program='WU'):
	"""
	Print and plot the distribution of best pairwise hit scores.

	For each ordered (query,target) pair of hits, regardless of hit region
	length, only the highest 'sprime' score is kept. The best scores are
	rounded to the nearest integer and counted; each "<score> <count>" pair
	is printed to stdout (ascending score) and the same curve is plotted
	through Gnuplot into '<blast_output>_unique_score_dist.png'.

	blast_output -- path of the BLAST output file, parsed line by line with
	                parseWUBLASTline
	program      -- only 'WU' is supported; any other value raises Exception

	Returns None. If the file yields no hits, a note is written to stderr
	and nothing is printed or plotted.
	"""
	if program != 'WU':
		raise Exception('temporarily does not support non-WUBLAST output!(TODO)')

	# Best score seen per (id1,id2) key. (a,b) and (b,a) are separate keys.
	# The default of 0 acts as a floor: a pair never records less than 0.
	best_score = defaultdict(lambda: 0)
	with open(blast_output) as f:
		for line in f:
			hit_dict = parseWUBLASTline(line)
			pair = (hit_dict['id1'], hit_dict['id2'])
			best_score[pair] = max(best_score[pair], hit_dict['sprime'])

	# Histogram: rounded best score -> number of pairs with that score.
	binned_scores = defaultdict(lambda: 0)
	for v in best_score.values():
		binned_scores[int(round(v))] += 1

	if not binned_scores:
		# max() below would raise ValueError on empty sequences.
		sys.stderr.write("no hits found in {0}, nothing to plot\n".format(blast_output))
		return

	# now plot it
	x = sorted(binned_scores)
	y = [binned_scores[k] for k in x]
	for score, count in zip(x, y):
		print("{0} {1}".format(score, count))

	# Imported here so Gnuplot is only needed when there is something to plot.
	import Gnuplot
	p = Gnuplot.Gnuplot()
	p('set terminal png')
	p("set xra [0:{0}]".format(max(x)+10))
	p("set yra [0:{0}]".format(max(y)+10))
	p("set out '{0}_unique_score_dist.png'".format(blast_output))
	D = Gnuplot.Data(x,y,with_=" lines lt -1 lw 1")
	p.plot(D)
	# 'set out' with no argument closes the png so it is flushed to disk
	p('set out')
def scatterplot_phylodist_to_score(blast_dirname,filename_rex,phylo_dirname,NONMEMBER_DIST=10.):
	"""
		Similar to scatterplot_length_to_maxscore, except that we're recording
		the branch distance to maximum observed score. And we do it PER family
		blast file since branch distances make sense only within a family....

		For every file under <blast_dirname> that matches <filename_rex> and
		ends in 'WUblast', writes '<file>.phylodist_to_score' next to it:
		one "<distance>\t<max sprime>" line per distance, ascending.
		Files whose output already exists are skipped.

		blast_dirname  -- directory holding the blast output files
		filename_rex   -- compiled regex; group(1) of its match is the family
		phylo_dirname  -- directory holding the '*<family>*.outtree' files
		NONMEMBER_DIST -- distance under which hits that are not counted as
		                  same-family are pooled

		A hit counts as same-family when the text before the first '_' is
		equal in both IDs and id1 starts with the family name.

		Also:
		  We assume that under <phylo_dirname>, there will only be 1 .outtree file
		  that fits the filename_rex-extracted family name (the first match is
		  used; IndexError if there is none)

		  We also assume that this blast output is a "real" vs "padded_real + negative",
		  so id1 is always not padded, and id2 is potentially padded
	"""
	for filename in os.listdir(blast_dirname):
		if os.path.exists(blast_dirname+'/'+filename+'.phylodist_to_score'):
			continue

		m = filename_rex.match(filename)
		if m is None or not filename.endswith('WUblast'):
			continue
		family = m.group(1)
		phylo_filename = fnmatch.filter(os.listdir(phylo_dirname),"*{0}*.outtree".format(family))[0]
		p = NewickReader(phylo_dirname+'/'+phylo_filename)
		sys.stderr.write("{0}\n".format(filename))

		# distance -> best score seen at that distance
		maxscore_of_dist = defaultdict(lambda: float("-inf"))
		with open(blast_dirname+'/'+filename,'r') as f:
			for line in f:
				d = parseWUBLASTline(line)
				id1,id2 = d['id1'],d['id2']
				# NOTE(review): find() returns -1 when an ID has no '_', which
				# drops the last character instead; kept as in the original.
				same_family = id1[:id1.find('_')]==id2[:id2.find('_')] and id1.startswith(family)
				if not same_family: # a non-hit
					maxscore_of_dist[NONMEMBER_DIST] = max(maxscore_of_dist[NONMEMBER_DIST], d['sprime'])
					continue
				try:
					dist = p.distance(parse_id(id1,True)[1], parse_id(id2,False)[1])
				except Exception:
					# best effort: report and skip this hit, but let
					# KeyboardInterrupt/SystemExit through
					sys.stderr.write("failed phylo dist on  {0} {1}\n".format(id1,id2))
				else:
					maxscore_of_dist[dist] = max(maxscore_of_dist[dist], d['sprime'])

		# the output file is written even when no hit was recorded
		with open(blast_dirname+'/'+filename+'.phylodist_to_score','w') as f:
			for k in sorted(maxscore_of_dist):
				f.write("{0}\t{1}\n".format(k, maxscore_of_dist[k]))
def blast_to_graph(real_fna_input,blast_output,score_cutoff,multi_edge,program='WU'):
	"""
		Make a node for each seq in <real_fna_input> (so we'll know loner nodes)
		Read <blast_output>, for each hit with 'sprime' >= score_cutoff,
		  make an edge (n1,n2) where n1,n2 are query & target seq IDs,

		If multi_edge is False, then
		  the edge weight is the (maximum observed) hit score
		Otherwise,
		  every hit adds an edge whose data is
		  ((start,end),(start,end),hit_score), with the region of the
		  smaller seq ID listed first

		real_fna_input -- fasta file, parsed with SeqIO
		blast_output   -- blast output file, parsed with parseWUBLASTline
		program        -- only 'WU' is supported; anything else raises Exception

		Stores the networkx XGraph in a pickle <blast_output>_cut<score_cutoff>.gpickle
		Then returns the pickle filename
	"""
	if program != 'WU':
		raise Exception('temporarily does not support non-WUBLAST output!(TODO)')

	X = XGraph()
	X.ban_selfloops() # this is for ignoring self-hits
	if multi_edge:
		X.allow_multiedges()
	else:
		X.ban_multiedges()

	with open(real_fna_input) as f:
		for r in SeqIO.parse(f,'fasta'):
			X.add_node(r.id)

	with open(blast_output) as f:
		for line in f:
			hit_dict = parseWUBLASTline(line)
			score = hit_dict['sprime']
			if score < score_cutoff:
				continue
			id1,id2 = hit_dict['id1'],hit_dict['id2']
			if multi_edge:
				region1 = (hit_dict['start1'],hit_dict['end1'])
				region2 = (hit_dict['start2'],hit_dict['end2'])
				# keep the regions in the order of the sorted IDs, so the
				# payload does not depend on which seq was the query
				if id1 < id2:
					X.add_edge(id1,id2,(region1,region2,score))
				else:
					X.add_edge(id1,id2,(region2,region1,score))
			elif (not X.has_edge(id1,id2)) or X.get_edge(id1,id2) < score:
				# single-edge mode: only ever raise the stored score
				X.add_edge(id1,id2,score)

	gpickle_filename = blast_output+'_cut'+str(score_cutoff)+'.gpickle'
	write_gpickle(X,gpickle_filename)
	return gpickle_filename
def scatterplot_length_to_maxscore(dirname,filename_pattern):
	"""
		Read through ALL files matching <filename_pattern> description
		(fnmatch-style) under directory <dirname>

		Assumes the files are in WU-BLAST format
		Then prints the points of a scatterplot to stdout, one "<x> <y>"
		line per point, where
		  x-axis is the mean of the two aligned region lengths of a hit,
		    (|end-start|+1 on each side, floored to an integer)
		  y-axis is the smallest 'e' value observed for SOME hit of that
		    length, never more than the starting value of 10

		NOTE: despite the function name, what is tracked is the minimum
		E-value, not the maximum score.

		Each processed filename is echoed to stderr. Returns None.
	"""
	# length -> smallest 'e' seen so far; 10 is the starting upper bound
	min_e_of_length = defaultdict(lambda: 10)
	for filename in fnmatch.filter(os.listdir(dirname),filename_pattern):
		sys.stderr.write("{0}\n".format(filename))
		with open(dirname+'/'+filename) as f:
			for line in f:
				hit_dict = parseWUBLASTline(line)
				l1 = abs(hit_dict['end2']-hit_dict['start2'])+1
				l2 = abs(hit_dict['end1']-hit_dict['start1'])+1
				# '//' keeps the Python 2 integer binning on Python 3 too
				# (assumes start/end are ints -- TODO confirm in parseWUBLASTline)
				length = (l1+l2)//2
				min_e_of_length[length] = min(min_e_of_length[length], hit_dict['e'])
	for length,e_value in min_e_of_length.items():
		print("{0} {1}".format(length, e_value))