import os
import re
import sys
from collections import defaultdict
from decimal import Context, localcontext

import miscMySQL
import miscTaxonomy

# Module-level database handles; tally() below reads d_cur as a global.
conn = miscMySQL.get_conn_ncRNA()
d_cur = miscMySQL.get_dict_cursor(conn)
connNCBI = miscMySQL.get_conn_NCBI()
curNCBI = connNCBI.cursor()


def tally():
    """Print a tab-separated family-by-phylum count table to stdout.

    Runs three queries through the module-level dict cursor ``d_cur``:

    1. the number of rows per family in
       ``Zasha_20081002_plus20071102_curated``;
    2. the number of rows per (family, phylum), where the phylum comes
       from a left join against ``NCBI.cache_acc_to_tax`` on ``acc``;
    3. the distinct phyla in ``NCBI.cache_acc_to_tax``, ordered by name,
       which become the column headers.

    The output is one header line (an empty first cell, ``TOTAL``, then
    one column per phylum) followed by one line per family.  A
    family/phylum combination with no rows is printed as an empty cell.

    Returns:
        None.  The table is written to ``sys.stdout``.
    """
    # family -> {'total': n, <phylum>: n, ...}; missing cells read as ''.
    result = defaultdict(lambda: defaultdict(lambda: ''))

    d_cur.execute("select family,count(*) as total from Zasha_20081002_plus20071102_curated group by family")
    for r in d_cur.fetchall():
        result[r['family']]['total'] = r['total']

    # NOTE(review): because this is a left join, accessions with no match
    # in cache_acc_to_tax are grouped under a NULL phylum (presumably
    # surfaced as None by the driver -- confirm).  That count is stored
    # but only printed if the same value also appears in `phyla` below.
    d_cur.execute("select z.family as family,c.phylum as phylum, count(*) as count from Zasha_20081002_plus20071102_curated as z left join NCBI.cache_acc_to_tax as c on (z.acc=c.acc) group by z.family,c.phylum")
    for r in d_cur.fetchall():
        result[r['family']][r['phylum']] = r['count']

    d_cur.execute("select distinct phylum from NCBI.cache_acc_to_tax order by phylum")
    # Materialise as a list: the phyla are walked once for the header and
    # then once more per family, so a one-shot iterator would not do.
    phyla = [r['phylum'] for r in d_cur.fetchall()]

    # Write whole lines through sys.stdout.write rather than chaining
    # trailing-comma print statements: the latter inserts extra spaces
    # between cells, which corrupts the tab-separated layout.
    write = sys.stdout.write
    write("\tTOTAL\t" + "\t".join(phyla) + "\n")
    for family, d in result.items():
        cells = ["{0}".format(family), "{0}".format(d['total'])]
        cells.extend(str(d[p]) for p in phyla)
        write("\t".join(cells) + "\n")


tally()
# NOTE(review): the script stops here with a non-zero exit status even
# when the tally succeeded, so nothing after this line is ever executed.
# Left unchanged because callers may depend on this behaviour.
sys.exit(-1)

# here's what we're interested in:
from miscMySQL import get_conn_Actino, get_dict_cursor from collections import defaultdict from operator import itemgetter from scipy import histogram """ This tmp script is used to do evaluation on the cliques we generated """ clique_filename='output/output_cliques/ALLActino_RefSeq25_m30s0_cut35gamma80.cliques.pickle' # here we keep track of some stats ncRNA_ids_seen = defaultdict(lambda: 0) clique_stats = defaultdict(lambda: {'sizes':[],'precisions':[]}) ncRNA_id_to_family = {} hitQ_sizes = [] conn = get_conn_Actino() cursor = get_dict_cursor(conn) with open(clique_filename) as f: QQQ = load(f) for Q in QQQ: if 75 in Q or 163 in Q: continue if len(Q) < 5: continue tally_by_family = defaultdict(lambda: 0) for i in Q: # if it's not a hit, will return (None,None) (ncRNA_id, ncRNA_family) = c2.check_hit(i, cursor) tally_by_family[ncRNA_family] += 1 ncRNA_ids_seen[ncRNA_id] += 1 ncRNA_id_to_family[ncRNA_id] = ncRNA_family # decide the dominant family of this cluster tally_by_family = tally_by_family.items()