def label_sets_for_nodes(cursor):
    """
    Map every node id found in table sets_for_nodes to its ncRNA family.

    cursor -- DB cursor; used here only to run
              "SELECT i FROM sets_for_nodes" and fetch the rows.

    Returns a dict: <sets_for_nodes> id --> ncRNA family, where the family is
    the second element of whatever c2.check_hit returns for that id
    (can be None).

    Side effects: sets c2.CONN_FUNC to this module's CONN_FUNC and writes one
    progress line per node to stderr.
    """
    c2.CONN_FUNC = CONN_FUNC
    cursor.execute("SELECT i FROM sets_for_nodes")

    nodes_label = {}
    for row in cursor.fetchall():
        node_id = row[0]
        # sys.stderr.write produces the same text on Python 2 and 3, whereas
        # the "print >> sys.stderr, ..." statement only works on Python 2.
        sys.stderr.write("labeling for node.... {0}\n".format(node_id))
        # check_hit yields an (ncRNA_id, ncRNA_family) pair; only the family
        # is needed for the label.
        _, ncRNA_fam = c2.check_hit(node_id)
        nodes_label[node_id] = ncRNA_fam
    return nodes_label
def graph_ncRNA_connectivity(clique_pickle=None):
    """
    Group ncRNA hits by family and run get_connectivity on each group.

    Reads the ids in table sets_for_nodes, passes each one to c2.check_hit
    and, for every hit (returned id is not None), stores the returned id
    under its family. get_connectivity is then called once per family with
    that family's list of ids and the cursor.

    Stated intent of the original author: for each ncRNA of family F,
    calculate the ratio of connected F members/Nones, for both immediately
    connected nodes and path-2 nodes.
    NOTE(review): the earlier docstring said this looks at table "parsed";
    the only query visible in this function is against sets_for_nodes --
    presumably c2.check_hit consults "parsed"; confirm.

    clique_pickle -- accepted but not used anywhere in this function.

    Returns None. Side effects: sets c2.CONN_FUNC to this module's CONN_FUNC
    and writes one progress line per scanned row to stderr.
    """
    c2.CONN_FUNC = CONN_FUNC

    # The scan stops as soon as more than this many hits have been collected.
    # NOTE(review): the original formatting was lost; the counter is assumed
    # to count hits rather than scanned rows -- confirm.
    max_hits = 100

    ncRNA_map = defaultdict(list)  # fam --> list of ids returned by check_hit
    hit_count = 0
    with CONN_FUNC() as cursor:
        cursor.execute("SELECT i FROM sets_for_nodes")
        for row in cursor.fetchall():
            # Same text as the Python-2 "print >> sys.stderr" statement, but
            # also valid on Python 3.
            sys.stderr.write("i is.... {0}\n".format(row[0]))
            ncRNA_id, fam = c2.check_hit(row[0])
            if ncRNA_id is None:
                # not a hit: nothing to record
                continue
            ncRNA_map[fam].append(ncRNA_id)
            hit_count += 1
            if hit_count > max_hits:
                break

        # Kept inside the with-block because get_connectivity is handed the
        # cursor that CONN_FUNC() manages.
        # NOTE(review): confirm this loop was inside the with-block originally.
        for list_of_i in ncRNA_map.values():
            get_connectivity(list_of_i, cursor)
clique_stats = defaultdict(lambda: {'sizes':[],'precisions':[]}) ncRNA_id_to_family = {} hitQ_sizes = [] conn = get_conn_Actino() cursor = get_dict_cursor(conn) with open(clique_filename) as f: QQQ = load(f) for Q in QQQ: if 75 in Q or 163 in Q: continue if len(Q) < 5: continue tally_by_family = defaultdict(lambda: 0) for i in Q: # if it's not a hit, will return (None,None) (ncRNA_id, ncRNA_family) = c2.check_hit(i, cursor) tally_by_family[ncRNA_family] += 1 ncRNA_ids_seen[ncRNA_id] += 1 ncRNA_id_to_family[ncRNA_id] = ncRNA_family # decide the dominant family of this cluster tally_by_family = tally_by_family.items() tally_by_family.sort(key=itemgetter(1)) fam,count = tally_by_family[-1] if fam is not None: print("{0}\t{1}/{2}".format(fam,count,len(Q))) hitQ_sizes.append(len(Q)) clique_stats[fam]['sizes'].append(len(Q)) clique_stats[fam]['precisions'].append(count*1./len(Q)) # now print the stats print('####################### stats #########################')