def _sort_cogs(cogs1, cogs2):
    """cmp-style comparator that ranks COG selections, best first.

    Each argument is indexed with ``[1]`` to obtain its list of COGs
    (item 0, the seed information, is ignored).  Two selections are
    compared by, in order:

    1. the size of their largest COG,
    2. their rounded mean COG size,
    3. their number of COGs.

    Larger values sort first, so the result is -1 when ``cogs1``
    outranks ``cogs2``, 1 in the opposite case and 0 when every
    criterion ties.
    """
    groups_a = cogs1[1]  # discard seed info
    groups_b = cogs2[1]  # discard seed info
    sizes_a = [len(group) for group in groups_a]
    sizes_b = [len(group) for group in groups_b]

    # The minimum sizes are computed as in the original code even though
    # the ranking below does not use them.
    largest_a, _smallest_a, mean_a = (
        _max(sizes_a), _min(sizes_a), round(_mean(sizes_a)))
    largest_b, _smallest_b, mean_b = (
        _max(sizes_b), _min(sizes_b), round(_mean(sizes_b)))

    # We want to maximize all these values, in this order of priority.
    criteria = (
        (largest_a, largest_b),
        (mean_a, mean_b),
        (len(groups_a), len(groups_b)),
    )
    for left, right in criteria:
        # Swapping the operands gives descending order.
        outcome = cmp(right, left)
        if outcome != 0:
            return outcome
    return 0
def _sort_cogs(cogs1, cogs2):
    """cmp-style comparator that ranks COG selections, best first.

    Each argument is indexed with ``[1]`` to obtain its list of COGs
    (item 0, the seed information, is ignored).  Two selections are
    compared by, in order:

    1. the size of their largest COG,
    2. their rounded mean COG size,
    3. their number of COGs.

    Larger values sort first, so the result is -1 when ``cogs1``
    outranks ``cogs2``, 1 in the opposite case and 0 when every
    criterion ties.
    """
    groups_a = cogs1[1]  # discard seed info
    groups_b = cogs2[1]  # discard seed info
    sizes_a = [len(group) for group in groups_a]
    sizes_b = [len(group) for group in groups_b]

    # The minimum sizes are computed as in the original code even though
    # the ranking below does not use them.
    largest_a, _smallest_a, mean_a = (
        _max(sizes_a), _min(sizes_a), round(_mean(sizes_a)))
    largest_b, _smallest_b, mean_b = (
        _max(sizes_b), _min(sizes_b), round(_mean(sizes_b)))

    # We want to maximize all these values, in this order of priority.
    criteria = (
        (largest_a, largest_b),
        (mean_a, mean_b),
        (len(groups_a), len(groups_b)),
    )
    for left, right in criteria:
        # Swapping the operands gives descending order.
        outcome = cmp(right, left)
        if outcome != 0:
            return outcome
    return 0
def get_identity(fname):
    """Summarise the per-column identity of the alignment in `fname`.

    The file is loaded with ``SeqGroup``.  The number of columns is taken
    from the first sequence yielded by ``id2seq.itervalues()``.  For each
    column, the identity is the count of the most frequent non-gap
    character divided by the number of non-gap characters; columns that
    contain only gaps ("-") are skipped.

    Returns a tuple ``(max, min, mean, std)`` of the per-column
    identities, computed with ``_max``, ``_min``, ``_mean`` and ``_std``.
    """
    alignment = SeqGroup(fname)
    n_columns = len(next(alignment.id2seq.itervalues()))

    identities = []
    for col in xrange(n_columns):
        counts = defaultdict(int)
        for sequence in alignment.id2seq.itervalues():
            residue = sequence[col]
            if residue == "-":
                continue
            counts[residue] += 1
        if not counts:
            # Gap-only column: contributes nothing to the statistics.
            continue
        tallies = counts.values()
        identities.append(float(max(tallies)) / sum(tallies))

    return (_max(identities), _min(identities),
            _mean(identities), _std(identities))
def get_seqs_identity(alg, seqs):
    """Return identity statistics of `alg` restricted to a set of sequences.

    alg: object providing ``get_seq(seq_id)``; the returned sequences are
        indexed by column position.
    seqs: sequence ids to include.  The number of columns is taken from
        the first one.

    For each column, the identity is the count of the most frequent
    non-gap character divided by the number of non-gap characters among
    the selected sequences; columns that contain only gaps ("-") are
    skipped.

    Returns a tuple ``(max, min, mean, std)`` of the per-column
    identities, computed with ``_max``, ``_min``, ``_mean`` and ``_std``.

    Raises IndexError when `seqs` is empty.
    """
    # Fetch every sequence once up front instead of calling get_seq()
    # again for every sequence at every column.
    sequences = [alg.get_seq(seq_id) for seq_id in seqs]
    seqlen = len(sequences[0])

    ident = []
    for i in xrange(seqlen):
        states = defaultdict(int)
        for seq in sequences:
            if seq[i] != "-":
                states[seq[i]] += 1
        values = states.values()
        if values:
            ident.append(float(max(values)) / sum(values))

    return (_max(ident), _min(ident), _mean(ident), _std(ident))
def get_best_selection(cogs_selections, species):
    """Rank the candidate COG selections and return the best one.

    cogs_selections: list whose entries are unpacked as
        ``(seed, missing_sp_allowed, candidates, sp2hits)``.  ``sp2hits``
        is read as a mapping from species to a hit count.
    species: the species under analysis.

    Entries are ranked, higher values first, by: number of species in
    ``sp2hits``, then the score, minimum coverage and maximum coverage
    returned by ``get_cog_score``, then the number of candidates.

    Side effects: `cogs_selections` is sorted in place (best entry first)
    and a summary table is printed to stdout.

    Returns ``(best_entry, cog_analysis)``, where ``cog_analysis`` is the
    StringIO object holding the summary table.

    Raises ValueError when `cogs_selections` is empty.
    """
    if not cogs_selections:
        raise ValueError("No COG selections to choose from")

    ALL_SPECIES = set(species)
    median_cogs = _median([len(data[2]) for data in cogs_selections])

    def _evaluate(selection):
        # Score a selection against every species except its own seed.
        seed, _missing_sp_allowed, candidates, sp2hits = selection
        return get_cog_score(candidates, sp2hits, median_cogs,
                             ALL_SPECIES - set([seed]))

    def _rank_key(selection):
        _seed, _missing_sp_allowed, candidates, sp2hits = selection
        score, min_cov, max_cov, _median_cov, _cov_std, _cog_cov = \
            _evaluate(selection)
        # The first criterion must use THIS selection's sp2hits.  The
        # previous comparator measured the first selection on both
        # sides, so species representation never influenced the ranking.
        return (len(sp2hits), score, min_cov, max_cov, len(candidates))

    # A key function scores each entry once, instead of twice per
    # comparison.  Ascending sort followed by reverse() keeps the
    # original ordering of tied entries.
    cogs_selections.sort(key=_rank_key)
    cogs_selections.reverse()

    header = ['seed',
              'missing sp allowed',
              'spcs covered',
              '#COGs',
              'mean sp coverage)',
              '#COGs for worst sp.',
              '#COGs for best sp.',
              'sp. in COGS(avg)',
              'SCORE']
    best_cog_selection = 0  # entries are sorted best first
    cog_analysis = StringIO()
    for i, cogs in enumerate(cogs_selections):
        seed, missing_sp_allowed, candidates, sp2hits = cogs
        n_candidates = float(len(candidates))
        sp_coverages = [sp2hits.get(sp, 0) for sp in species]
        sp_percent_coverages = [(100 * hits) / n_candidates
                                for hits in sp_coverages]
        score, _min_cov, _max_cov, _median_cov, _cov_std, cog_cov = \
            _evaluate(cogs)

        flag = "*" if i == best_cog_selection else " "
        worst, best = min(sp_coverages), max(sp_coverages)
        # NOTE(review): the percentage in 'spcs covered' divides all
        # species by covered species, so it is >= 100%.  It looks
        # inverted; kept as is, confirm the intended meaning.
        row = (flag + "%10s" % seed,
               missing_sp_allowed,
               "%d (%0.1f%%)" % (len(sp2hits) + 1,
                                 100 * float(len(ALL_SPECIES)) / (len(sp2hits) + 1)),
               len(candidates),
               "%0.1f%% +- %0.1f" % (_mean(sp_percent_coverages),
                                     _std(sp_percent_coverages)),
               "% 3d (%0.1f%%)" % (worst, 100 * worst / n_candidates),
               "% 3d (%0.1f%%)" % (best, 100 * best / n_candidates),
               cog_cov,
               score)
        # Only the first row carries the table header.
        print_as_table([row], header=header, print_header=(i == 0),
                       stdout=cog_analysis)

    print(cog_analysis.getvalue())
    return cogs_selections[best_cog_selection], cog_analysis
def brh_cogs2(DB, species, missing_factor=0.0, seed_sp=None, min_score=0):
    """Detect Clusters of Orthologs (COGs) from precalculated BRH pairs.

    Pairs are read from the ``ortho_pair`` table and grouped around the
    sequences of a seed species.

    DB: object exposing the ``seqcursor`` and ``orthocursor`` cursors.
    species: iterable of species identifiers (converted to ``str``).
    missing_factor: fraction of the species a COG is allowed to lack.  A
        COG must contain at least
        ``len(species) - round(missing_factor * len(species))`` members.
    seed_sp: "auto" to evaluate every species as seed, "largest" to use
        the species with the largest ``size`` in the ``species`` table,
        any other value is used as the seed itself.
    min_score: accepted for interface compatibility; not used here.

    Returns ``(recoded_cogs, analysis_text)``.  ``recoded_cogs`` is a
    list of COGs, each a list of
    ``"<species><GLOBALS['spname_delimiter']><seqid>"`` strings.
    ``analysis_text`` is the seed comparison table as text.

    Raises ValueError when no seed candidate exists, when the best seed
    cannot cover all species, or when a consistency check fails.
    """
    log.log(26, "Searching BRH orthologs")
    species = set(map(str, species))
    min_species = len(species) - round(missing_factor * len(species))

    if seed_sp == "auto":
        sp_to_test = list(species)
    elif seed_sp == "largest":
        # Use the DB handle passed by the caller, as the rest of the
        # function does, rather than a module-level ``db`` name.
        cmd = """SELECT taxid, size FROM species"""
        DB.seqcursor.execute(cmd)
        sp2size = {}
        for tax, counter in DB.seqcursor.fetchall():
            if tax in species:
                sp2size[tax] = counter
        sorted_sp = sorted(sp2size.items(), key=lambda item: item[1])
        log.log(24, sorted_sp[:6])
        largest_sp = sorted_sp[-1][0]
        sp_to_test = [largest_sp]
        log.log(28, "Using %s as search seed. Proteome size=%s genes" %
                (largest_sp, sp2size[largest_sp]))
    else:
        sp_to_test = [str(seed_sp)]

    if not sp_to_test:
        # Previously this fell through and failed later with an unbound
        # ``seed`` variable.
        raise ValueError("No seed species available for COG selection")

    analysis_txt = StringIO()
    log.log(26, "Finding best COG selection...")
    seed2size = get_sorted_seeds(seed_sp, species, sp_to_test, min_species, DB)
    size_analysis = []
    for seedname, content in seed2size.iteritems():
        cog_sizes = [size for _seq, size in content]
        mx, avg = _max(cog_sizes), round(_mean(cog_sizes))
        size_analysis.append([seedname, mx, avg, len(content)])
    # Best seed first: largest COG, then average COG size, then number
    # of COGs.  reverse=True keeps tied seeds in their original order.
    size_analysis.sort(key=lambda row: (row[1], row[2], row[3]), reverse=True)
    seed = size_analysis[0][0]
    print_as_table(size_analysis[:25], stdout=analysis_txt,
                   header=["Seed", "largest COG", "avg COG size", "total COGs"])
    if size_analysis[0][1] < len(species) - 1:
        print(size_analysis[0][1])
        raise ValueError(
            "Current COG selection parameters do not permit to cover all species")
    log.log(28, analysis_txt.getvalue())

    log.log(28, "Computing Clusters of Orthologs groups (COGs)")
    log.log(28, "Min number of species per COG: %d" % min_species)
    log.log(26, "Using seed species:%s", seed)

    # The seed is matched as taxid1 against species sorting after it and
    # as taxid2 against species sorting before it.
    species_side1 = ','.join(
        map(quote, [s for s in species if str(s) > str(seed)]))
    species_side2 = ','.join(
        map(quote, [s for s in species if str(s) < str(seed)]))
    pairs1 = []
    pairs2 = []
    # Select all ids with matches in the target species.  Both queries
    # return the seed-side sequence in the first two columns.
    # NOTE(review): the SQL is built by string interpolation; confirm
    # that seed and species values are trusted, or use parameters.
    if species_side1 != "":
        cmd = """SELECT seqid1, taxid1, seqid2, taxid2 from ortho_pair WHERE taxid1="%s" AND taxid2 IN (%s) """ % (seed, species_side1)
        DB.orthocursor.execute(cmd)
        pairs1 = DB.orthocursor.fetchall()

    if species_side2 != "":
        cmd = """SELECT seqid2, taxid2, seqid1, taxid1 from ortho_pair WHERE taxid1 IN (%s) AND taxid2 = "%s" """ % (species_side2, seed)
        DB.orthocursor.execute(cmd)
        pairs2 = DB.orthocursor.fetchall()

    # Group every hit under its seed sequence; each group is a set of
    # (species, seqid) members including the seed sequence itself.
    cog_candidates = defaultdict(set)
    for seq1, sp1, seq2, sp2 in pairs1 + pairs2:
        seed_member = (sp1, seq1)
        cog_candidates[seed_member].update([seed_member, (sp2, seq2)])

    all_cogs = [cand for cand in cog_candidates.values()
                if len(cand) >= min_species]

    # CHECK CONSISTENCY: the seed sequences found here must match the
    # ones reported by get_sorted_seeds().
    seqs = set()
    for cand in all_cogs:
        seqs.update([b for a, b in cand if a == seed])
    pre_selected_seqs = set([v[0] for v in seed2size[seed]])
    n_common = len(seqs & pre_selected_seqs)
    n_preselected = len(set(seed2size[seed]))
    if n_common != n_preselected or n_common != len(seqs):
        print("old method seqs %s new seqs %s Common %s" %
              (len(seqs), n_preselected, n_common))
        raise ValueError("ooops")

    # Each COG must hold exactly one sequence per species.
    cog_sizes = [len(cog) for cog in all_cogs]
    cog_spsizes = [len(set([e[0] for e in cog])) for cog in all_cogs]
    if cog_sizes != cog_spsizes:
        raise ValueError("Inconsistent COG found")

    log.log(26, "Found %d COGs" % len(all_cogs))

    recoded_cogs = []
    for cog in all_cogs:
        named_cog = ["%s%s%s" % (x[0], GLOBALS["spname_delimiter"], x[1])
                     for x in cog]
        recoded_cogs.append(named_cog)

    return recoded_cogs, analysis_txt.getvalue()
def get_best_selection(cogs_selections, species):
    """Rank the candidate COG selections and return the best one.

    cogs_selections: list whose entries are unpacked as
        ``(seed, missing_sp_allowed, candidates, sp2hits)``.  ``sp2hits``
        is read as a mapping from species to a hit count.
    species: the species under analysis.

    Entries are ranked, higher values first, by: number of species in
    ``sp2hits``, then the score, minimum coverage and maximum coverage
    returned by ``get_cog_score``, then the number of candidates.

    Side effects: `cogs_selections` is sorted in place (best entry first)
    and a summary table is printed to stdout.

    Returns ``(best_entry, cog_analysis)``, where ``cog_analysis`` is the
    StringIO object holding the summary table.

    Raises ValueError when `cogs_selections` is empty.
    """
    if not cogs_selections:
        raise ValueError("No COG selections to choose from")

    ALL_SPECIES = set(species)
    median_cogs = _median([len(data[2]) for data in cogs_selections])

    def _evaluate(selection):
        # Score a selection against every species except its own seed.
        seed, _missing_sp_allowed, candidates, sp2hits = selection
        return get_cog_score(candidates, sp2hits, median_cogs,
                             ALL_SPECIES - set([seed]))

    def _rank_key(selection):
        _seed, _missing_sp_allowed, candidates, sp2hits = selection
        score, min_cov, max_cov, _median_cov, _cov_std, _cog_cov = \
            _evaluate(selection)
        # The first criterion must use THIS selection's sp2hits.  The
        # previous comparator measured the first selection on both
        # sides, so species representation never influenced the ranking.
        return (len(sp2hits), score, min_cov, max_cov, len(candidates))

    # A key function scores each entry once, instead of twice per
    # comparison.  Ascending sort followed by reverse() keeps the
    # original ordering of tied entries.
    cogs_selections.sort(key=_rank_key)
    cogs_selections.reverse()

    header = ['seed',
              'missing sp allowed',
              'spcs covered',
              '#COGs',
              'mean sp coverage)',
              '#COGs for worst sp.',
              '#COGs for best sp.',
              'sp. in COGS(avg)',
              'SCORE']
    best_cog_selection = 0  # entries are sorted best first
    cog_analysis = StringIO()
    for i, cogs in enumerate(cogs_selections):
        seed, missing_sp_allowed, candidates, sp2hits = cogs
        n_candidates = float(len(candidates))
        sp_coverages = [sp2hits.get(sp, 0) for sp in species]
        sp_percent_coverages = [(100 * hits) / n_candidates
                                for hits in sp_coverages]
        score, _min_cov, _max_cov, _median_cov, _cov_std, cog_cov = \
            _evaluate(cogs)

        flag = "*" if i == best_cog_selection else " "
        worst, best = min(sp_coverages), max(sp_coverages)
        # NOTE(review): the percentage in 'spcs covered' divides all
        # species by covered species, so it is >= 100%.  It looks
        # inverted; kept as is, confirm the intended meaning.
        row = (flag + "%10s" % seed,
               missing_sp_allowed,
               "%d (%0.1f%%)" % (len(sp2hits) + 1,
                                 100 * float(len(ALL_SPECIES)) / (len(sp2hits) + 1)),
               len(candidates),
               "%0.1f%% +- %0.1f" % (_mean(sp_percent_coverages),
                                     _std(sp_percent_coverages)),
               "% 3d (%0.1f%%)" % (worst, 100 * worst / n_candidates),
               "% 3d (%0.1f%%)" % (best, 100 * best / n_candidates),
               cog_cov,
               score)
        # Only the first row carries the table header.
        print_as_table([row], header=header, print_header=(i == 0),
                       stdout=cog_analysis)

    print(cog_analysis.getvalue())
    return cogs_selections[best_cog_selection], cog_analysis
def brh_cogs2(DB, species, missing_factor=0.0, seed_sp=None, min_score=0):
    """Detect Clusters of Orthologs (COGs) from precalculated BRH pairs.

    Pairs are read from the ``ortho_pair`` table and grouped around the
    sequences of a seed species.

    DB: object exposing the ``seqcursor`` and ``orthocursor`` cursors.
    species: iterable of species identifiers (converted to ``str``).
    missing_factor: fraction of the species a COG is allowed to lack.  A
        COG must contain at least
        ``len(species) - round(missing_factor * len(species))`` members.
    seed_sp: "auto" to evaluate every species as seed, "largest" to use
        the species with the largest ``size`` in the ``species`` table,
        any other value is used as the seed itself.
    min_score: accepted for interface compatibility; not used here.

    Returns ``(recoded_cogs, analysis_text)``.  ``recoded_cogs`` is a
    list of COGs, each a list of
    ``"<species><GLOBALS['spname_delimiter']><seqid>"`` strings.
    ``analysis_text`` is the seed comparison table as text.

    Raises ValueError when no seed candidate exists, when the best seed
    cannot cover all species, or when a consistency check fails.
    """
    log.log(26, "Searching BRH orthologs")
    species = set(map(str, species))
    min_species = len(species) - round(missing_factor * len(species))

    if seed_sp == "auto":
        sp_to_test = list(species)
    elif seed_sp == "largest":
        # Use the DB handle passed by the caller, as the rest of the
        # function does, rather than a module-level ``db`` name.
        cmd = """SELECT taxid, size FROM species"""
        DB.seqcursor.execute(cmd)
        sp2size = {}
        for tax, counter in DB.seqcursor.fetchall():
            if tax in species:
                sp2size[tax] = counter
        sorted_sp = sorted(sp2size.items(), key=lambda item: item[1])
        log.log(24, sorted_sp[:6])
        largest_sp = sorted_sp[-1][0]
        sp_to_test = [largest_sp]
        log.log(28, "Using %s as search seed. Proteome size=%s genes" %
                (largest_sp, sp2size[largest_sp]))
    else:
        sp_to_test = [str(seed_sp)]

    if not sp_to_test:
        # Previously this fell through and failed later with an unbound
        # ``seed`` variable.
        raise ValueError("No seed species available for COG selection")

    analysis_txt = StringIO()
    log.log(26, "Finding best COG selection...")
    seed2size = get_sorted_seeds(seed_sp, species, sp_to_test, min_species, DB)
    size_analysis = []
    for seedname, content in seed2size.iteritems():
        cog_sizes = [size for _seq, size in content]
        mx, avg = _max(cog_sizes), round(_mean(cog_sizes))
        size_analysis.append([seedname, mx, avg, len(content)])
    # Best seed first: largest COG, then average COG size, then number
    # of COGs.  reverse=True keeps tied seeds in their original order.
    size_analysis.sort(key=lambda row: (row[1], row[2], row[3]), reverse=True)
    seed = size_analysis[0][0]
    print_as_table(size_analysis[:25], stdout=analysis_txt,
                   header=["Seed", "largest COG", "avg COG size", "total COGs"])
    if size_analysis[0][1] < len(species) - 1:
        print(size_analysis[0][1])
        raise ValueError(
            "Current COG selection parameters do not permit to cover all species")
    log.log(28, analysis_txt.getvalue())

    log.log(28, "Computing Clusters of Orthologs groups (COGs)")
    log.log(28, "Min number of species per COG: %d" % min_species)
    log.log(26, "Using seed species:%s", seed)

    # The seed is matched as taxid1 against species sorting after it and
    # as taxid2 against species sorting before it.
    species_side1 = ','.join(
        map(quote, [s for s in species if str(s) > str(seed)]))
    species_side2 = ','.join(
        map(quote, [s for s in species if str(s) < str(seed)]))
    pairs1 = []
    pairs2 = []
    # Select all ids with matches in the target species.  Both queries
    # return the seed-side sequence in the first two columns.
    # NOTE(review): the SQL is built by string interpolation; confirm
    # that seed and species values are trusted, or use parameters.
    if species_side1 != "":
        cmd = """SELECT seqid1, taxid1, seqid2, taxid2 from ortho_pair WHERE taxid1="%s" AND taxid2 IN (%s) """ % (seed, species_side1)
        DB.orthocursor.execute(cmd)
        pairs1 = DB.orthocursor.fetchall()

    if species_side2 != "":
        cmd = """SELECT seqid2, taxid2, seqid1, taxid1 from ortho_pair WHERE taxid1 IN (%s) AND taxid2 = "%s" """ % (species_side2, seed)
        DB.orthocursor.execute(cmd)
        pairs2 = DB.orthocursor.fetchall()

    # Group every hit under its seed sequence; each group is a set of
    # (species, seqid) members including the seed sequence itself.
    cog_candidates = defaultdict(set)
    for seq1, sp1, seq2, sp2 in pairs1 + pairs2:
        seed_member = (sp1, seq1)
        cog_candidates[seed_member].update([seed_member, (sp2, seq2)])

    all_cogs = [cand for cand in cog_candidates.values()
                if len(cand) >= min_species]

    # CHECK CONSISTENCY: the seed sequences found here must match the
    # ones reported by get_sorted_seeds().
    seqs = set()
    for cand in all_cogs:
        seqs.update([b for a, b in cand if a == seed])
    pre_selected_seqs = set([v[0] for v in seed2size[seed]])
    n_common = len(seqs & pre_selected_seqs)
    n_preselected = len(set(seed2size[seed]))
    if n_common != n_preselected or n_common != len(seqs):
        print("old method seqs %s new seqs %s Common %s" %
              (len(seqs), n_preselected, n_common))
        raise ValueError("ooops")

    # Each COG must hold exactly one sequence per species.
    cog_sizes = [len(cog) for cog in all_cogs]
    cog_spsizes = [len(set([e[0] for e in cog])) for cog in all_cogs]
    if cog_sizes != cog_spsizes:
        raise ValueError("Inconsistent COG found")

    log.log(26, "Found %d COGs" % len(all_cogs))

    recoded_cogs = []
    for cog in all_cogs:
        named_cog = ["%s%s%s" % (x[0], GLOBALS["spname_delimiter"], x[1])
                     for x in cog]
        recoded_cogs.append(named_cog)

    return recoded_cogs, analysis_txt.getvalue()