def eval_clique(Q, cursor):
	"""
	Determine the plurality ncRNA family of a clique by looking its nodes up in the db.

	Q      --- clique: a sized iterable of node indices
	cursor --- db cursor; one lookup query is executed per node

	Like eval_original_fna, returns:
	(fam, # of seqs belonging to fam, clique_size, fam ids hit)

	fam is the family get_ribo1 reports most often, which can be None.
	If None wins but a runner-up family exists, the runner-up is returned
	instead when it is large enough, with a suffix marking the substitution:
	'-' if it covers at least half of the clique, '--' if it has at least 3 hits.

	An empty clique returns (None, 0, 0, set()).
	"""
	FETCH_SQL = "SELECT n.id,s.start,s.end \
				FROM sets_for_nodes s \
				LEFT JOIN nodes_to_index AS n \
				ON (s.nodes_ind=n.ind) WHERE i={i}"

	tally = defaultdict(lambda: 0)      # fam ---> hit count
	hits = defaultdict(lambda: set())   # fam ---> set of ids hit

	for i in Q:
		cursor.execute( FETCH_SQL.format(i=i) )
		# NOTE(review): if a node index has no row, fetchone() presumably
		# returns None and this unpacking fails -- confirm that cannot happen.
		_id,_loc_start,_loc_end = cursor.fetchone()
		(acc,junk),strand,start,end = parsed_accID(_id, True, _loc_start, _loc_end)
		ribo_id, fam = get_ribo1( acc, start, end )
		tally[fam] += 1
		hits[fam].add( ribo_id )

	clique_size = len(Q)
	if not tally:
		# nothing was looked up, so there is no family to rank
		return None, 0, clique_size, set()

	# largest count first; ties keep the order in which families were first seen
	ranked = sorted(tally.items(), key=itemgetter(1), reverse=True)
	fam, count = ranked[0]
	# HACK: when "no family" wins, fall back to a big enough runner-up
	if fam is None and len(ranked) > 1:
		runner_up, runner_up_count = ranked[1]
		if runner_up_count >= 0.5*clique_size:
			return runner_up+'-', runner_up_count, clique_size, hits[runner_up]
		elif runner_up_count >= 3:
			return runner_up+'--', runner_up_count, clique_size, hits[runner_up]
	return fam, count, clique_size, hits[fam]
def eval_original_fna(fasta_filename):
	"""
	Reads the header lines of a fasta file and returns
	(ncRNA family, # of seqs belonging to the family, clique size, ids hit for the family)

	clique size is the number of '>' header lines in the file.
	ncRNA family is determined by plurality, which can be None.
	If None wins but a runner-up family exists, the runner-up is returned
	instead when it is large enough, with a suffix marking the substitution:
	'-' if it covers at least half of the clique, '--' if it has at least 3 hits.

	A file without any header line returns (None, 0, 0, set()).
	"""
	tally_by_family = defaultdict(lambda: 0)
	ids_hit_by_family = defaultdict(lambda: set())
	clique_size = 0
	# Read the headers directly instead of piping through a shell `grep`:
	# no quoting problems with odd filenames and the file is always closed.
	with open(fasta_filename) as handle:
		for line in handle:
			if not line.startswith('>'):
				continue
			seq_id = line.strip()[1:]
			clique_size += 1
			(acc,junk),strand,start,end = parsed_accID(seq_id,True)
			ncRNA_id, ncRNA_family = get_ribo1(acc,start,end)
			tally_by_family[ncRNA_family] += 1
			ids_hit_by_family[ncRNA_family].add( ncRNA_id )

	if not tally_by_family:
		# no headers found, so there is no family to rank
		return None,0,clique_size,set()

	# smallest count first, so the plurality family is last
	ranked = sorted(tally_by_family.items(), key=itemgetter(1))
	fam,count = ranked[-1]
	# HACK HERE!!! when "no family" wins, fall back to a big enough runner-up
	if fam is None and len(ranked) > 1:
		runner_up,runner_up_count = ranked[-2]
		if runner_up_count >= 0.5*clique_size:
			return runner_up + '-',runner_up_count,clique_size,ids_hit_by_family[runner_up]
		elif runner_up_count >= 3:
			return runner_up + '--',runner_up_count,clique_size,ids_hit_by_family[runner_up]
	return fam,count,clique_size,ids_hit_by_family[fam]
def read_cmfinder_motif(motif_filename):
	"""
	Reads a motif file (outputted by CMfinder, stockholm format)
	Returns (ncRNA family, # of motif members belonging to the family, # of motif members)
	ncRNA family is determined by plurality. family is None if plurality are not ncRNAs.

	Motif members are the '#=GS <id> WT ...' lines; reading stops at the
	first '#=GS <id> DE ...' line. Lines with fewer than three
	whitespace-separated fields are skipped.
	A file without any member line returns (None, 0, 0).
	"""
	tally_by_family = defaultdict(lambda: 0)
	motif_size = 0
	with open(motif_filename) as f:
		# the first two lines of the file are not parsed
		f.readline()
		f.readline()
		for line in f:
			fields = line.strip().split(None,2)
			if len(fields) < 3:
				# blank or too short to be an annotated '#=GS' line
				continue
			feature, text, annot = fields
			if feature != '#=GS':
				continue
			if annot.startswith('DE'):
				break
			if annot.startswith('WT'):
				motif_size += 1
				sys.stderr.write("looking up id %s\n" % (text,))
				(acc,junk),strand,start,end = parsed_accID(text,True)
				ncRNA_id, ncRNA_family = get_ribo1(acc,start,end)
				tally_by_family[ncRNA_family] += 1
	sys.stderr.write("motif filename is %s\n" % (motif_filename,))
	sys.stderr.write("tally by family is %s\n" % (tally_by_family,))

	if not tally_by_family:
		# no member lines were seen, so there is no family to rank
		return None,0,motif_size

	# smallest count first, so the plurality family is last
	ranked = sorted(tally_by_family.items(), key=itemgetter(1))
	fam,count = ranked[-1]
	return fam,count,motif_size
# Example #4
# 0
def get_tree_info(id_map):
	"""
	Annotate every sequence in id_map with its species and closest downstream gene.

	id_map --- dict of tree label --> sequence id (parsed with miscParses.parsed_accID)

	Returns (taxon_map, gene_seq_map, species_map):
	taxon_map    --- label --> '<label>-<species>::<gene description>',
	                 or 'Unknown' when no downstream gene is found
	gene_seq_map --- sequence id --> downstream gene (None when not found)
	species_map  --- accession --> organism name

	The gene description is the first available of the 'product', 'note' and
	'gene' qualifiers; it is left empty when the gene has none of them.
	"""
	taxon_map = {}
	gene_seq_map = {}
	species_map = {}
	for label, seqid in id_map.items():
		sys.stderr.write("processing  %s %s\n" % (label, seqid))
		idp = miscParses.parsed_accID(seqid)
		# now we annotate it with (a) species name; (b) closest downstream gene description
		r = NCBI.get_acc_gb(idp.acc)
		species_name = r.annotations['organism']
		species_map[idp.acc] = species_name
		# search from the end coordinate on the + strand, from the start otherwise
		anchor = idp.end if idp.strand == +1 else idp.start
		gene = NCBI.get_closest_downstream_gene(r, anchor, idp.strand)
		if gene is None:
			taxon_map[label] = 'Unknown'
			gene_seq_map[seqid] = None
			continue
		# TODO: put this into a phyloXML format!!
		# for now....let's just do normal newick
		gene_seq_map[seqid] = gene
		newname = label + '-' + species_name + '::'
		# Qualifier values are indexed with [0] for 'product' and 'note', so
		# 'gene' is presumably a list as well; appending it un-indexed would
		# raise a TypeError.
		for key in ('product', 'note', 'gene'):
			if key in gene.qualifiers:
				newname += gene.qualifiers[key][0]
				break
		taxon_map[label] = newname
	return taxon_map, gene_seq_map, species_map
# Example #5
# 0
def get_fam_count(fasta_filename):
	"""
	Count, per ncRNA family, the records of a fasta file.

	Every record id is parsed with parsed_accID and looked up with get_ribo1;
	the family that comes back (whatever get_ribo1 returns, the other
	functions in this file treat None as "no family") is tallied.

	Returns a defaultdict of family --> number of records.
	"""
	from Bio import SeqIO
	from miscParses import parsed_accID
	total_fam_counts = defaultdict(lambda: 0)
	# read the scanned fasta so we know how many ribos per family there are
	# doing this everytime may be a waste, but it ensures we're having the right counts...
	with open(fasta_filename) as handle:  # closed even if parsing or lookup fails
		for r in SeqIO.parse(handle, 'fasta'):
			sys.stderr.write("fasta reading.... %s\n" % (r.id,))
			(acc, junk_version),strand,start,end = parsed_accID(r.id, True)
			(rb_id, rb_fam) = get_ribo1(acc, start, end)
			total_fam_counts[rb_fam] += 1
	return total_fam_counts
# Example #6
# 0
def map_tree_taxon(tree_filename, id_map):
	"""
	Relabel the leaves of a newick tree with species and downstream gene description.

	tree_filename --- newick tree file, read with dendropy
	id_map        --- dict of leaf label (like 'T1') --> <acc>/<start>-<end>

	Returns (t, taxon_map, gene_seq_map):
	t            --- the tree, with annotated leaves relabelled in place
	taxon_map    --- original label --> '<label>-<species>::<gene description>',
	                 or 'Unknown' when no downstream gene is found
	gene_seq_map --- original label --> the gene's 'translation' qualifier
	                 (None when no downstream gene is found)

	The gene description is the first available of the 'product', 'note' and
	'gene' qualifiers; it is left empty when the gene has none of them.
	A leaf without a downstream gene keeps its original label in the tree.
	Raises KeyError when a gene that was found has no 'translation' qualifier.
	"""
	taxon_map = {}
	gene_seq_map = {}
	t = dendropy.Tree.get_from_path(tree_filename, 'newick')
	for n in t.leaf_nodes():
		# n.taxon.label is like 'T1', id_map maps back to <acc>/<start>-<end>
		label = n.taxon.label
		idp = miscParses.parsed_accID(id_map[label])
		# now we annotate it with (a) species name; (b) closest downstream gene description
		r = NCBI.get_acc_gb(idp.acc)
		species_name = r.annotations['organism']
		# search from the end coordinate on the + strand, from the start otherwise
		anchor = idp.end if idp.strand == +1 else idp.start
		gene = NCBI.get_closest_downstream_gene(r, anchor, idp.strand)
		if gene is None:
			# same convention as get_tree_info: record the miss instead of
			# failing on gene.qualifiers
			taxon_map[label] = 'Unknown'
			gene_seq_map[label] = None
			continue
		# TODO: put this into a phyloXML format!!
		# for now....let's just do normal newick
		gene_seq_map[label] = gene.qualifiers['translation']
		newname = label + '-' + species_name + '::'
		for key in ('product', 'note', 'gene'):
			if key in gene.qualifiers:
				newname += gene.qualifiers[key][0]
				break
		taxon_map[label] = newname
		n.taxon.label = newname
	return t, taxon_map, gene_seq_map
def evaluate_blast_graph_count_nucleotides(graph_or_filename,hit_ratio=None,ignore_prefix=['shuffled','random']):
	"""
		Similar to evaluate_blast_graph, except that the real ncRNAs (in the DB, not query)
		  are embedded with flanking regions, and the IDs should be in
		  format <family>_<DB id>_<acc>/<embedded_start>-<embedded_end>

		If <hit_ratio> is None, then for each node N,
		  sensitivity = (# of real ncRNA-neighbor nts) / (# of real ncRNA nts)
		  specificity = (# of real ncRNA-neighbor nts) / (# of neighbor nts)
		  where the sensitivity denominator is the summed length of all
		  query ncRNAs of N's family.

		If <hit_ratio> is defined, ex: 0.8, then for each node N,
		  a neighbor node M is a hit if the # of hit on M is >= <hit_ratio>*<M's ncRNA len>

		NOTE: for this kind of blast output, the INPUT should be seq IDs like <family>_<db_id>
		      which means they are real ncRNAs with NO padding
		      and the DB can either be random/shuffled seqIDs
		      or <family>_<db_id>_<acc>/<embedded_start>-<embedded_end>

		graph_or_filename --- an XGraph; anything else is handed to read_gpickle
		hit_ratio         --- None, or a value convertible to float
		ignore_prefix     --- node-name prefixes that mark non-ncRNA sequences

		Returns (None, sens_by_family, spec_by_family); both dicts map
		family --> list holding one value per query node of that family.
	"""
	rex = re.compile(r'(\S+)_(\d+)_(\S+)')
	rex_real = re.compile(r'(\S+)_(\d+)')
	from miscncRNA import get_ncRNA_info

	# str.startswith takes a tuple of alternatives
	prefixes = tuple(ignore_prefix)

	if hit_ratio is not None:
		hit_ratio = float(hit_ratio)

	if isinstance(graph_or_filename, XGraph):
		X = graph_or_filename
	else:
		X = read_gpickle(graph_or_filename)

	total_nt_by_family = defaultdict(lambda: 0)
	spec_by_family = defaultdict(lambda: [])
	sens_by_family = defaultdict(lambda: [])
	for n in X.nodes_iter():
		# only <family>_<db_id> nodes are queries; skip decoys and embedded DB ids
		if n.startswith(prefixes): continue
		if n.count('_') > 2: continue
		m = rex_real.match(n)
		if m is None: continue
		family,query_db_id = m.group(1),m.group(2)
		tmp_true = defaultdict(lambda: IntervalSet())
		tmp_false = defaultdict(lambda: IntervalSet())

		# the query nodes must be <family>_<db_id> (i.e. no padding)
		info = get_ncRNA_info(query_db_id)
		sys.stderr.write("%s\n" % (n,))
		total_nt_by_family[family] += info['end']-info['start']+1
		if X.degree(n) == 0: # has 0 neighbors
			sens_by_family[family].append(0)
			spec_by_family[family].append(0)
			continue
		# e is in format (local_start,local_end,score)
		for (myself,neighbor,e) in X.edges_iter(n):
			if neighbor.startswith(prefixes):
				# not a real ncRNA
				tmp_false[neighbor].add(Interval(e[0],e[1]))
				continue
			m_hit = rex.match(neighbor)
			db_id,blob = m_hit.group(2),m_hit.group(3)
			if db_id == query_db_id: continue # it's a self vs self-embedded hit, ignore
			# parsed_accID(..., True, ...) is unpacked as strand,start,end by
			# every other caller in this file; the previous start,end,strand
			# order here fed the strand into the interval.
			(acc,duncare),hit_strand,hit_start,hit_end = parsed_accID(blob,True,e[0],e[1])
			tmp_true[db_id].add(Interval(hit_start,hit_end))

		tp,fp = (0,0)
		for db_id,regions in tmp_true.items():
			info = get_ncRNA_info(db_id)
			for x in regions:
				c = calc_overlap(info['start'],info['end'],x.lower_bound,x.upper_bound)
				if hit_ratio is None:
					# nucleotide mode: overlap is true, the rest of the hit is false
					tp += c
					fp += (x.upper_bound-x.lower_bound+1) - c
				elif c >= hit_ratio*(info['end']-info['start']+1):
					tp += 1
				else:
					fp += 1
		# NOTE(review): decoy hits are added as nucleotide counts even in
		# hit_ratio mode, where tp/fp otherwise count hits -- confirm intent.
		for regions in tmp_false.values():
			for x in regions: fp += x.upper_bound-x.lower_bound+1

		sys.stderr.write("%s %s\n" % (tp, fp))
		if tp+fp == 0:
			sens_by_family[family].append(0)
			spec_by_family[family].append(0)
		else:
			sens_by_family[family].append(tp) # NOTE: it's raw count!!!
			spec_by_family[family].append(tp*1./(tp+fp))

	# turn the raw sensitivity counts into ratios
	for k in sens_by_family:
		if hit_ratio is None:
			denom = total_nt_by_family[k]
		else:
			# NOTE(review): this used to be len(total_nt_by_family[k]), i.e.
			# len() of an int, which always raised TypeError. Exactly one
			# entry is appended per query node, so the list length is the
			# number of queries in the family; normalising by it is the
			# presumed intent -- confirm.
			denom = len(sens_by_family[k])
		sens_by_family[k] = [x*1./denom for x in sens_by_family[k]]
	return (None,sens_by_family,spec_by_family)