コード例 #1
0
def scatterplot_phylodist_to_score(blast_dirname,filename_rex,phylo_dirname,NONMEMBER_DIST=10.):
	"""
		For each family blast file under <blast_dirname>, record the maximum
		observed score (the 'sprime' field) per phylogenetic branch distance and
		write the table to '<blast file>.phylodist_to_score' as tab-separated
		"<distance>\t<max score>" lines, sorted by distance.

		This is done PER family blast file since branch distances make sense
		only within a family.

		blast_dirname  -- directory holding the blast output files
		filename_rex   -- compiled regex; group(1) of its match on the file name
		                  is used as the family name
		phylo_dirname  -- directory holding the '*<family>*.outtree' tree files
		NONMEMBER_DIST -- distance key under which all non-member hits (ids from
		                  different families, or id1 not starting with the
		                  family name) are pooled

		Files whose output already exists are skipped, so the function can be
		re-run to resume an interrupted pass.

		Assumptions (from the original author):
		  The blast files not only match filename_rex, but also end in 'WUblast'

		  Under <phylo_dirname>, there will only be 1 .outtree file that fits the
		  filename_rex-extracted family name (the first match is used)

		  The blast output is a "real" vs "padded_real + negative", so id1 is
		  always not padded, and id2 is potentially padded

		Raises IndexError if no .outtree file matches the family.
	"""
	for filename in os.listdir(blast_dirname):
		blast_path = os.path.join(blast_dirname, filename)
		output_path = blast_path + '.phylodist_to_score'
		if os.path.exists(output_path):
			continue

		m = filename_rex.match(filename)
		if m is None or not filename.endswith('WUblast'):
			continue
		family = m.group(1)

		tree_pattern = "*{0}*.outtree".format(family)
		tree_names = fnmatch.filter(os.listdir(phylo_dirname), tree_pattern)
		if not tree_names:
			# Same exception type the bare [0] lookup used to raise, but with context.
			raise IndexError("no file matching {0!r} found under {1!r}".format(tree_pattern, phylo_dirname))
		p = NewickReader(os.path.join(phylo_dirname, tree_names[0]))

		# Progress report; written directly so it works on both Python 2 and 3.
		sys.stderr.write(filename + "\n")

		maxscore_of_dist = defaultdict(lambda: float("-inf"))
		with open(blast_path,'r') as f:
			for line in f:
				d = parseWUBLASTline(line)
				id1,id2 = d['id1'],d['id2']
				# partition() keeps the whole id when there is no '_', whereas
				# id[:id.find('_')] would silently chop off the last character.
				same_family = id1.partition('_')[0] == id2.partition('_')[0]
				if same_family and id1.startswith(family):
					try:
						dist = p.distance(parse_id(id1,True)[1], parse_id(id2,False)[1])
						maxscore_of_dist[dist] = max(maxscore_of_dist[dist], d['sprime'])
					except Exception:
						# Best effort: report the pair and keep going, but let
						# KeyboardInterrupt / SystemExit propagate.
						sys.stderr.write("failed phylo dist on  {0} {1}\n".format(id1, id2))
				else: # a non-hit
					maxscore_of_dist[NONMEMBER_DIST] = max(maxscore_of_dist[NONMEMBER_DIST], d['sprime'])

		with open(output_path,'w') as f:
			for k in sorted(maxscore_of_dist):
				f.write("{0}\t{1}\n".format(k, maxscore_of_dist[k]))
コード例 #2
0
def evaluate_blast_graph_by_phylo(graph_or_filename,phylo_filename,ignore_prefix=('shuffled_','random_')):
	"""
		Score every real node of a blast-processed XGraph by how many of its
		neighbors are in its own family and how far away they are in the tree.

		graph_or_filename -- the XGraph itself, or the filename of the pickled
		                     XGraph (loaded with read_gpickle)
		phylo_filename    -- tree file handed to NewickReader
		ignore_prefix     -- iterable of prefixes marking FAKE sequence ids

		The REAL ncRNA seq IDs should be in format <family>_<DB id>
		(the split is on the LAST underscore).
		The FAKE seq IDs should be in format <ignore_prefix>xxxxx.....
		Nodes whose family part starts with one of the prefixes are skipped,
		and neighbors of that form are not counted as same-family hits.

		Returns (result,phylo_sum_by_family,spec_by_family)
		where result is an obsolete placeholder and is always None,
		phylo_sum_by_family is dict with family --> list of sums of phylo
		                    distances to same-family neighbors, one entry per
		                    node in this family (0 for isolated nodes; a failed
		                    distance lookup contributes nothing to the sum)
		spec_by_family is dict with family --> list of
		                    (# same-family neighbors) / degree, one entry per
		                    node in this family (0 for isolated nodes)
	"""
	# isinstance (rather than an exact type check) also accepts XGraph subclasses.
	if isinstance(graph_or_filename, XGraph):
		X = graph_or_filename
	else:
		X = read_gpickle(graph_or_filename)

	p = NewickReader(phylo_filename)
	# str.startswith accepts a tuple, giving one call instead of any(map(...)).
	fake_prefixes = tuple(ignore_prefix)

	phylo_sum_by_family = defaultdict(list)
	spec_by_family = defaultdict(list)
	for n in X.nodes_iter():
		i = n.rfind('_')
		family,db_id = n[:i],n[(i+1):]
		if family.startswith(fake_prefixes):
			continue
		degree = X.degree(n)
		if degree == 0:
			spec_by_family[family].append(0)
			phylo_sum_by_family[family].append(0)
			continue
		phylo_sum,tp = 0,0
		for m in X.neighbors_iter(n):
			# Apply the fake-id rule to the NEIGHBOR. The previous code re-tested
			# the outer node's family here, which can never match at this point.
			if m[:m.rfind('_')].startswith(fake_prefixes):
				continue
			m1 = parse_id(m,True)
			if m1[0] == family:
				try:
					phylo_sum += p.distance(db_id,m1[1])
				except Exception:
					# Best effort: a pair with no computable distance still
					# counts as a same-family hit, it just adds no distance.
					pass
				tp += 1
		phylo_sum_by_family[family].append(phylo_sum)
		spec_by_family[family].append(float(tp)/degree)

	return (None,phylo_sum_by_family,spec_by_family)