import os,re,sys,miscMySQL,miscTaxonomy
from collections import defaultdict
from decimal import Context, localcontext

# Module-level database handles, opened once when the script starts.
# d_cur is the cursor tally() queries; its rows are read by column name
# (r['family'], r['total'], ...), i.e. it yields dict-like rows.
conn = miscMySQL.get_conn_ncRNA()
d_cur = miscMySQL.get_dict_cursor(conn)
# Second connection/cursor, presumably to the NCBI taxonomy database
# (going by the helper's name -- TODO confirm). Not used by tally() in the
# code visible here.
connNCBI = miscMySQL.get_conn_NCBI()
curNCBI = connNCBI.cursor()

def tally(cursor=None, out=None):
	"""
	Write a tab-separated family-by-phylum count table.

	The first row is the header: an empty cell, TOTAL, then every distinct
	phylum found in NCBI.cache_acc_to_tax.  Each following row holds one
	family: its name, its total row count in the curated table, and its
	count in each phylum (an empty cell when the family has no rows there).

	cursor -- cursor returning dict-like rows; defaults to the module-level
	          d_cur
	out    -- writable text stream receiving the table; defaults to
	          sys.stdout
	"""
	if cursor is None:
		cursor = d_cur
	if out is None:
		out = sys.stdout

	# family -> {'total': n, <phylum>: n, ...}
	result = defaultdict(dict)
	cursor.execute("select family,count(*) as total from Zasha_20081002_plus20071102_curated group by family")
	for r in cursor.fetchall():
		result[r['family']]['total'] = r['total']
	# The left join keeps curated rows whose accession has no taxonomy entry;
	# those are counted under the phylum key None.
	cursor.execute("select z.family as family,c.phylum as phylum, count(*) as count from Zasha_20081002_plus20071102_curated as z left join NCBI.cache_acc_to_tax as c on (z.acc=c.acc) group by z.family,c.phylum")
	for r in cursor.fetchall():
		result[r['family']][r['phylum']] = r['count']

	cursor.execute("select distinct phylum from NCBI.cache_acc_to_tax order by phylum")
	phyla = [r['phylum'] for r in cursor.fetchall()]

	# A NULL phylum comes back as None, which str.join() rejects; label it
	# 'None' so the header cannot crash and still lines up with the rows.
	header = ['', 'TOTAL'] + ['None' if p is None else p for p in phyla]
	out.write("\t".join(header) + "\n")

	for family, d in result.items():
		# Build the whole row first and join once: printing cell by cell with
		# trailing commas makes Python 2 insert a space before every cell.
		cells = ["{0}".format(family), str(d.get('total', ''))]
		cells.extend(str(d.get(p, '')) for p in phyla)
		out.write("\t".join(cells) + "\n")

# Print the family-by-phylum table as soon as the script is run.
tally()
# Stop unconditionally right after the report: nothing below this line in
# this script is executed. Note that -1 is a non-zero exit status, so the
# shell sees the run as a failure even when tally() succeeded.
sys.exit(-1)

# here's what we're interested in:
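# ---- Example no. 2 ----
# (Separator text, apparently left over from the code listing this file was
# copied from; a separate script starts below. The listing also carried the
# bare value "0" here.)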
from miscMySQL import get_conn_Actino, get_dict_cursor
from collections import defaultdict
from operator import itemgetter
from scipy import histogram
"""
	This tmp script is used to do evaluation on the cliques we generated
"""
# Input file holding the cliques to evaluate.
clique_filename = 'output/output_cliques/ALLActino_RefSeq25_m30s0_cut35gamma80.cliques.pickle'

# Accumulators filled in while the cliques are scanned.
# ncRNA id -> number of times that id was returned for a clique member
ncRNA_ids_seen = defaultdict(int)
# key -> fresh {'sizes': [...], 'precisions': [...]} record on first access
clique_stats = defaultdict(
	lambda: {
		'sizes': [],
		'precisions': [],
	}
)
# ncRNA id -> family reported for that id
ncRNA_id_to_family = dict()
hitQ_sizes = list()

# Scan every clique in the input file and tally, per clique, which ncRNA
# family each member belongs to.
conn = get_conn_Actino()
cursor = get_dict_cursor(conn)
# NOTE(review): neither `load` nor `c2` is imported in the lines visible
# here. `load` is presumably pickle.load given the .pickle filename, in which
# case the file should be opened in binary mode -- TODO confirm both.
with open(clique_filename) as f:
	QQQ = load(f)
	for Q in QQQ:
		# Skip any clique that contains member 75 or member 163.
		# NOTE(review): the reason for excluding these two is not recorded.
		if 75 in Q or 163 in Q: continue
		# Ignore cliques with fewer than 5 members.
		if len(Q) < 5:
			continue
		# family -> number of members of this clique assigned to it
		tally_by_family = defaultdict(lambda: 0)
		for i in Q:
			# check_hit returns (None, None) when member i is not a hit, so
			# non-hits are counted under the key None in all three tallies.
			(ncRNA_id, ncRNA_family) = c2.check_hit(i, cursor)
			tally_by_family[ncRNA_family] += 1
			ncRNA_ids_seen[ncRNA_id] += 1
			ncRNA_id_to_family[ncRNA_id] = ncRNA_family
		# Decide the dominant family of this clique: start by turning the
		# tally into (family, count) pairs.
		tally_by_family = tally_by_family.items()