def col_consensus(col, prev_col=[], prev_char=[]):
    """Return the consensus residue for one alignment column.

    The residue with the highest count reported by
    ``sequtils.aa_frequencies`` wins. Ties are broken in two stages:

    1. Look only at rows whose residue in the previously seen column equals
       that column's consensus, and keep the tied residues that are most
       common among those rows.
    2. If more than one candidate remains, pick the one with the lowest
       value in the module-level ``bg_freqs`` table.

    NOTE: ``prev_col`` and ``prev_char`` are *deliberately* mutable default
    arguments. Both are overwritten in place at the end of every call, so
    when the caller omits them they act as hidden state that carries the
    last column and its consensus into the next call. Pass your own lists
    to keep independent state. Presumably callers feed columns in alignment
    order -- confirm against the call sites.

    Args:
        col: The residues of a single column, one entry per row.
        prev_col: List that receives a copy of ``col`` (updated in place).
        prev_char: List that receives the chosen residue (updated in place).

    Returns:
        The consensus residue for ``col``.

    Raises:
        AssertionError: If ``aa_frequencies`` yields no counts for the
            column (per the message: the column is all gaps), or if the
            internal sanity check on the untied winner fails.
    """
    # Count the amino acid types in this column
    aa_counts = sequtils.aa_frequencies(col)
    assert aa_counts, "Column is all gaps! That's not allowed."
    # Take the most common residue(s).
    # .items() instead of .iteritems(): same pairs, same order, but it also
    # exists on Python 3 dicts.
    best_char, best_score = max(aa_counts.items(), key=lambda kv: kv[1])
    # Resolve ties
    ties = [aa for aa in aa_counts if aa_counts[aa] == best_score]
    if len(ties) > 1:
        # Breaker #1: most common after the prev. consensus char
        # Resolve a tied col by restricting to rows where the preceding
        # char is the consensus type for that (preceding) col
        if prev_char and prev_col:
            mc_next = Counter(
                [b for a, b in zip(prev_col, col)
                 if a == prev_char[0] and b in ties]
            ).most_common()
            # Keep every residue sharing the top count; the list is empty
            # when no row matched, in which case the ties stay untouched.
            ties_next = [x[0] for x in mc_next if x[1] == mc_next[0][1]]
            if ties_next:
                ties = ties_next
        if len(ties) > 1:
            # Breaker #2: lowest overall residue frequency
            ties.sort(key=lambda aa: bg_freqs[aa])
        best_char = ties[0]
    else:
        assert best_char == ties[0], \
            'WTF %s != %s[0]' % (best_char, ties)
    # Save values for tie-breaker #1 (in-place so the default-argument
    # lists keep the state for the next call)
    prev_col[:] = col
    # Slice-assigning a one-character string stores it as a single element
    prev_char[:] = best_char
    return best_char
def col_consensus(col):
    """Return the best-scoring residue of a column, or '-' on failure.

    The candidates are the keys of ``sequtils.aa_frequencies(col)``; each
    one is scored by the callable that ``entropy_func`` builds from those
    frequencies and the module-level ``bg_freqs``. The highest-scoring key
    is returned.

    If picking the maximum raises ``ValueError`` (e.g. there are no
    candidates at all, which is probably what an all-gap column produces),
    the gap character ``'-'`` is returned instead.
    """
    freqs = sequtils.aa_frequencies(col)
    score = entropy_func(freqs, bg_freqs)
    try:
        winner = max(freqs.keys(), key=score)
    except ValueError:
        # Most likely the column held nothing but gaps.
        # TODO: surface more detail here to make failures easier to debug
        winner = '-'
    return winner
def col_consensus(col):
    """Return the best-scoring residue of a column.

    The candidates are the keys of ``sequtils.aa_frequencies(col)``; each
    one is scored by the callable that ``entropy_func`` builds from those
    frequencies and the module-level ``bg_freqs``. The highest-scoring key
    is returned.

    No fallback is applied: if there are no candidates, the ``ValueError``
    raised by ``max`` propagates to the caller.
    """
    freqs = sequtils.aa_frequencies(col)
    # Build the scorer before collecting candidates, as the original did
    score = entropy_func(freqs, bg_freqs)
    candidates = freqs.keys()
    return max(candidates, key=score)