def scatterplot_phylodist_to_score(blast_dirname,filename_rex,phylo_dirname,NONMEMBER_DIST=10.):
    """
    Write, per family blast file, the maximum observed score at each
    phylogenetic branch distance.

    Similar to scatterplot_length_to_maxscore, except that we're plotting
    the branch distance to maximum observed score. This is done PER family
    blast file since branch distances make sense only within a family.

    For every file <f> in <blast_dirname> that matches filename_rex and ends
    in 'WUblast', the result is written to <f>.phylodist_to_score as
    tab-separated "<distance>\\t<max score>" lines sorted by distance.
    Files whose output already exists are skipped.

    Assumptions:
      - group(1) of filename_rex is the family name
      - under <phylo_dirname> there is exactly one .outtree file matching
        "*<family>*.outtree" (the first match is used)
      - the blast output is "real" vs "padded_real + negative", so id1 is
        always not padded, and id2 is potentially padded

    Parameters:
      blast_dirname  -- directory holding the blast output files
      filename_rex   -- compiled regex applied with .match() to each filename
      phylo_dirname  -- directory holding the .outtree files
      NONMEMBER_DIST -- distance bucket under which all non-hits (pairs that
                        are not both in this file's family) are recorded

    Raises:
      IndexError if no .outtree file is found for a family.
    """
    for filename in os.listdir(blast_dirname):
        m = filename_rex.match(filename)
        if m is None or not filename.endswith('WUblast'):
            continue
        blast_path = os.path.join(blast_dirname, filename)
        out_path = blast_path + '.phylodist_to_score'
        if os.path.exists(out_path):
            # already processed on an earlier run
            continue

        family = m.group(1)
        phylo_filename = fnmatch.filter(os.listdir(phylo_dirname),"*{0}*.outtree".format(family))[0]
        p = NewickReader(os.path.join(phylo_dirname, phylo_filename))
        sys.stderr.write("{0}\n".format(filename))

        # distance --> best score seen at that distance
        maxscore_of_dist = defaultdict(lambda: float("-inf"))
        with open(blast_path,'r') as f:
            for line in f:
                d = parseWUBLASTline(line)
                id1,id2 = d['id1'],d['id2']
                # split() instead of id[:id.find('_')]: find() returns -1 when
                # there is no underscore, which would chop the last character
                same_prefix = id1.split('_', 1)[0] == id2.split('_', 1)[0]
                if same_prefix and id1.startswith(family):
                    try:
                        dist = p.distance(parse_id(id1,True)[1], parse_id(id2,False)[1])
                    except Exception:
                        # best effort: report and move on, but let
                        # KeyboardInterrupt/SystemExit propagate
                        sys.stderr.write("failed phylo dist on  {0} {1}\n".format(id1, id2))
                    else:
                        maxscore_of_dist[dist] = max(maxscore_of_dist[dist], d['sprime'])
                else:
                    # a non-hit
                    maxscore_of_dist[NONMEMBER_DIST] = max(maxscore_of_dist[NONMEMBER_DIST], d['sprime'])

        with open(out_path,'w') as f:
            for k in sorted(maxscore_of_dist):
                f.write("{0}\t{1}\n".format(k, maxscore_of_dist[k]))
def evaluate_blast_graph_by_phylo(graph_or_filename,phylo_filename,ignore_prefix=('shuffled_','random_')):
    """
    Score every real node of a blast-processed XGraph by the phylogenetic
    distance to, and the family purity of, its neighbors.

    Reads the blast-processed XGraph (or the filename of the pickled XGraph).
    The REAL ncRNA seq IDs should be in format <family>_<DB id>
    The FAKE seq IDs should be in format <ignore_prefix>xxxxx.....

    Parameters:
      graph_or_filename -- an XGraph instance, or a filename handed to
                           read_gpickle
      phylo_filename    -- tree file handed to NewickReader
      ignore_prefix     -- iterable of ID prefixes marking fake sequences;
                           nodes with such IDs are not scored, and neighbors
                           with such IDs are never counted as same-family

    Returns (None,phylo_sum_by_family,spec_by_family) where
      the first element is an obsolete placeholder, always None
      phylo_sum_by_family is dict with family --> list of
          sums of neighbor phylo distances for all nodes in this family
      spec_by_family is dict with family --> list of
          spec(ratio) for all nodes in this family, i.e.
          (# same-family neighbors) / (node degree)
    Nodes with degree 0 contribute 0 to both lists.
    """
    if isinstance(graph_or_filename, XGraph):
        X = graph_or_filename
    else:
        # NOTE(review): this unpickles the file; only use on trusted input
        X = read_gpickle(graph_or_filename)
    p = NewickReader(phylo_filename)
    # str.startswith accepts a tuple of prefixes
    ignored = tuple(ignore_prefix)

    phylo_sum_by_family = defaultdict(list)
    spec_by_family = defaultdict(list)

    for n in X.nodes_iter():
        # test the full ID: the family part is cut at the LAST underscore, so
        # e.g. 'shuffled_123' would give family 'shuffled' and slip through
        if n.startswith(ignored):
            continue
        i = n.rfind('_')
        family,db_id = n[:i],n[(i+1):]

        degree = X.degree(n)
        if degree == 0:
            spec_by_family[family].append(0)
            phylo_sum_by_family[family].append(0)
            continue

        phylo_sum,tp = 0,0
        for m in X.neighbors_iter(n):
            # skip fake neighbors (this used to re-test the outer node's
            # family, which could never be true at this point)
            if m.startswith(ignored):
                continue
            m1 = parse_id(m,True)
            if m1[0] == family:
                try:
                    phylo_sum += p.distance(db_id,m1[1])
                except Exception:
                    # best effort: a failed distance lookup adds nothing to
                    # the sum, but the neighbor still counts as same-family
                    sys.stderr.write("failed phylo dist on  {0} {1}\n".format(n, m))
                tp += 1
        phylo_sum_by_family[family].append(phylo_sum)
        # denominator is the full degree, fake neighbors included
        spec_by_family[family].append(tp*1./degree)

    return (None,phylo_sum_by_family,spec_by_family)