def process_pair(fg_aln, bg_aln, module, do_weight):
    """Score each column of a foreground/background alignment pair.

    Both alignments are first passed through ``clean_alignments``. When
    `do_weight` is true the per-sequence weights come from
    ``alnutils.sequence_weights``; otherwise every sequence gets weight 1.

    Returns a 3-tuple ``(fg_aln, bg_aln, hits)``: the two cleaned alignments
    and a list holding one ``(foreground consensus aa, background consensus
    aa, p-value)`` tuple per column position.
    """
    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)

    if not do_weight:
        fg_weights = [1] * len(fg_aln)
        bg_weights = [1] * len(bg_aln)
    else:
        fg_weights = alnutils.sequence_weights(fg_aln, 'none')
        bg_weights = alnutils.sequence_weights(bg_aln, 'none')

    # The urn module sizes the foreground by row count, not by summed weight
    if module != urn:
        fg_size = fsum(fg_weights)
    else:
        fg_size = len(fg_aln)
    bg_size = fsum(bg_weights)

    # Overall aa freqs for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)
    fg_cons = consensus.consensus(fg_aln, weights=fg_weights,
                                  trim_ends=False, gap_threshold=GAP_THRESH)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights,
                                  trim_ends=False, gap_threshold=GAP_THRESH)

    def column_pvalue(fg_char, bg_char, fg_column, bg_column):
        """Return the p-value for one pair of aligned columns."""
        if fg_char == '-' or bg_char == '-':
            # Ignore indel columns -- there are better ways to look at these
            return 1.
        return module.compare_cols(
            fg_column, fg_char, fg_size, fg_weights,
            bg_column, bg_char, bg_size, bg_weights,
            aa_freqs, PSEUDO_SIZE)

    paired_columns = zip(fg_cons, bg_cons, zip(*fg_aln), zip(*bg_aln))
    hits = [(fg_char, bg_char,
             column_pvalue(fg_char, bg_char, fg_column, bg_column))
            for fg_char, bg_char, fg_column, bg_column in paired_columns]
    return fg_aln, bg_aln, hits
def process_pair(fg_aln, bg_aln, module, nw):
    """Calculate a per-column "contrast" between two alignments.

    The alignments are first passed through ``clean_alignments``. If `nw`
    is true, every sequence gets weight 1; otherwise the weights are taken
    from ``alnutils.sequence_weights``.

    Returns ``(fg_aln, bg_aln, hits)`` where `fg_aln` and `bg_aln` are the
    cleaned alignments and `hits` is a list of tuples ``(foreground
    consensus aa, background consensus aa, p-value)``, one per column
    position.
    """
    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)

    if not nw:
        fg_weights = alnutils.sequence_weights(fg_aln, 'none')
        bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    else:
        fg_weights = [1] * len(fg_aln)
        bg_weights = [1] * len(bg_aln)

    # For the urn module the foreground size is the number of rows
    fg_size = len(fg_aln) if not (module != urn) else sum(fg_weights)
    bg_size = sum(bg_weights)

    # Overall aa freqs for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)

    consensus_options = dict(trim_ends=False, gap_threshold=GAP_THRESH)
    fg_cons = consensus.consensus(fg_aln, weights=fg_weights,
                                  **consensus_options)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights,
                                  **consensus_options)

    hits = []
    columns = zip(fg_cons, bg_cons, zip(*fg_aln), zip(*bg_aln))
    for fg_char, bg_char, fg_column, bg_column in columns:
        has_gap = fg_char == '-' or bg_char == '-'
        if has_gap:
            # Ignore indel columns -- there are better ways to look at these
            score = 1.
        else:
            score = module.compare_cols(
                fg_column, fg_char, fg_size, fg_weights,
                bg_column, bg_char, bg_size, bg_weights,
                aa_freqs, PSEUDO_SIZE)
        hits.append((fg_char, bg_char, score))
    return fg_aln, bg_aln, hits
def compare_aln(fg_aln, bg_aln):
    """Compare alignments using the ball-in-urn model.

    Like CHAIN does.

    Returns a list with one ``(foreground consensus aa, background consensus
    aa, p-value)`` tuple per column. Columns whose consensus is a gap in
    either alignment get a p-value of 1.0.
    """
    # BG seqs are weighted, FG seqs are not
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_size = sum(bg_weights)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights)
    # Height of the foreground alignment column
    fg_size = len(fg_aln)
    fg_cons = consensus.consensus(fg_aln)
    fg_columns = zip(*fg_aln)
    bg_columns = zip(*bg_aln)
    fg_weights = [1] * fg_size
    pseudocounts = combined_frequencies(fg_aln, fg_weights,
                                        bg_aln, bg_weights)

    hits = []
    for fg_char, bg_char, fg_column, bg_column in zip(
            fg_cons, bg_cons, fg_columns, bg_columns):
        if fg_char == '-' or bg_char == '-':
            # Ignore indel columns -- there are better ways to look at these
            hits.append((fg_char, bg_char, 1.0))
            continue

        # Cumulative binomial test
        # count_col's tallies for S, T and Y in the foreground column
        fg_counts = count_col(fg_column, fg_weights, pseudocounts)
        fg_tot = fg_counts['S'] + fg_counts['T'] + fg_counts['Y']
        # Frequency of S, T and Y in the combined alignment column
        bg_counts = count_col(bg_column, bg_weights, pseudocounts)
        combined_tot = (bg_counts['S'] + bg_counts['T'] + bg_counts['Y']
                        + fg_tot)
        # pseudocount size = 1.0
        p_j = combined_tot / (bg_size + fg_size + 2.0)
        # Probability of fg col conservation vs. the combined/main set
        # (P_j_LB in the publication)
        # NB: Some tweaks for pseudocounts
        first_k = int(math.ceil(fg_tot))
        tail_probs = binom.pmf(range(first_k, fg_size + 2), fg_size + 1, p_j)
        pvalue = tail_probs.sum()
        if pvalue == 1.0:
            logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                         p_j, fg_tot, bg_counts)
        hits.append((fg_char, bg_char, pvalue))
    return hits
def compare_aln(fg_aln, bg_aln):
    """Compare alignments using the ball-in-urn model.

    Like CHAIN does.

    Returns one ``(foreground consensus aa, background consensus aa,
    p-value)`` tuple per alignment column, as a list. A column whose
    consensus character is a gap in either alignment is assigned 1.0.
    """
    # Only the background sequences are weighted; each foreground
    # sequence counts as 1.
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_size = sum(bg_weights)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights)
    # Height of the foreground alignment column
    fg_size = len(fg_aln)
    fg_cons = consensus.consensus(fg_aln)
    fg_cols = zip(*fg_aln)
    bg_cols = zip(*bg_aln)
    fg_weights = [1] * fg_size
    pseudocounts = combined_frequencies(fg_aln, fg_weights,
                                        bg_aln, bg_weights)

    def binomial_tail(fg_col, bg_col):
        """Cumulative binomial test for one pair of non-gap columns."""
        # S/T/Y tallies (as reported by count_col) in the foreground column
        fg_counts = count_col(fg_col, fg_weights, pseudocounts)
        fg_tot = fg_counts['S'] + fg_counts['T'] + fg_counts['Y']
        # The same residues' frequency in the combined alignment column
        bg_counts = count_col(bg_col, bg_weights, pseudocounts)
        numerator = (bg_counts['S'] + bg_counts['T'] + bg_counts['Y']
                     + fg_tot)
        denominator = bg_size + fg_size + 2.0  # pseudocount size = 1.0
        p_j = numerator / denominator
        # Probability of fg col conservation vs. the combined/main set
        # (P_j_LB in the publication)
        # NB: Some tweaks for pseudocounts
        successes = range(int(math.ceil(fg_tot)), fg_size + 2)
        result = binom.pmf(successes, fg_size + 1, p_j).sum()
        if result == 1.0:
            logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                         p_j, fg_tot, bg_counts)
        return result

    hits = []
    for faa, baa, fg_col, bg_col in zip(fg_cons, bg_cons, fg_cols, bg_cols):
        if faa == '-' or baa == '-':
            # Ignore indel columns -- there are better ways to look at these
            pvalue = 1.0
        else:
            pvalue = binomial_tail(fg_col, bg_col)
        hits.append((faa, baa, pvalue))
    return hits
def process_one(aln, module, nw):
    """Calculate a mapping of alignment column positions to "contrast".

    If `nw` is true, every sequence gets weight 1; otherwise the weights
    come from ``alnutils.sequence_weights``.

    Returns ``(aln, hits)`` where `hits` holds one ``(consensus aa, '_',
    p-value)`` tuple per column.
    """
    if not nw:
        weights = alnutils.sequence_weights(aln, 'none')
    else:
        weights = [1] * len(aln)

    # The urn module uses the row count as the alignment size
    if module != urn:
        aln_size = sum(weights)
    else:
        aln_size = len(aln)

    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(aln, weights=weights, trim_ends=False,
                               gap_threshold=GAP_THRESH)

    hits = []
    for cons_char, column in zip(cons, zip(*aln)):
        if cons_char != '-':
            score = module.compare_one(column, cons_char, aln_size, weights,
                                       aa_freqs, PSEUDO_SIZE)
        else:
            # Ignore indel columns -- there are better ways to look at these
            score = 1.
        hits.append((cons_char, '_', score))
    return aln, hits
def process_one(aln, module, do_weight):
    """Calculate a mapping of alignment column positions to "contrast".

    When `do_weight` is true the sequence weights come from
    ``alnutils.sequence_weights``; otherwise each sequence gets weight 1.

    Returns a pair ``(aln, hits)``; `hits` is a list with one ``(consensus
    aa, '_', p-value)`` tuple per column of `aln`.
    """
    weights = (alnutils.sequence_weights(aln, 'none') if do_weight
               else [1] * len(aln))
    # For the urn module the alignment size is its number of rows
    aln_size = len(aln) if not (module != urn) else fsum(weights)
    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(aln, weights=weights, trim_ends=False,
                               gap_threshold=GAP_THRESH)

    def column_pvalue(cons_aa, col):
        """Return the p-value for a single column."""
        if cons_aa == '-':
            # Ignore indel columns -- there are better ways to look at these
            return 1.
        return module.compare_one(col, cons_aa, aln_size, weights,
                                  aa_freqs, PSEUDO_SIZE)

    hits = [(cons_aa, '_', column_pvalue(cons_aa, col))
            for cons_aa, col in zip(cons, zip(*aln))]
    return aln, hits
def consensus(aln, weights=None, gap_threshold=0.5, simple=False,
              trim_ends=True):
    """Get the consensus of an alignment, as a string.

    Emit gap characters for majority-gap columns; apply various strategies to
    choose the consensus amino acid type for the remaining columns.

    Parameters
    ----------
    aln : sequence of sequences
        The alignment rows; columns are obtained with ``zip(*aln)``.
    weights : sequence of numbers, or None
        Sequence weights, one per alignment row (they are paired positionally
        with the characters of each column). If given, used to calculate
        amino acid frequencies; otherwise calculated within this function
        (i.e. this is a way to speed up the function if sequence weights have
        already been calculated). Ignored in 'simple' mode.
    gap_threshold : float
        If the proportion of gap characters in a column is greater than or
        equal to this value (after sequence weighting, if applicable), then
        the consensus character emitted will be a gap instead of an amino
        acid type.
    simple : bool
        If True, use simple plurality to determine the consensus amino acid
        type, without weighting sequences for similarity. Otherwise, weight
        sequences for similarity and use relative entropy to choose the
        consensus amino acid type.
    trim_ends : bool
        If False, stretch the consensus sequence to include the N- and C-tails
        of the alignment, even if those flanking columns are mostly gap
        characters. This avoids terminal gaps in the consensus (needed for
        MAPGAPS).

    Returns
    -------
    str
        One consensus character per alignment column.
    """
    # Choose your algorithms!
    if simple:
        # Use the simple, unweighted algorithm
        col_consensus = make_simple_col_consensus(alnutils.aa_frequencies(aln))

        def is_majority_gap(col):
            return (float(col.count('-')) / len(col) >= gap_threshold)
        # ENH (alternatively/additionally): does any aa occur more than once?
        # ENH: choose gap-decisionmaking separately from col_consensus
    else:
        # Use the entropy-based, weighted algorithm
        if weights is None:
            seq_weights = alnutils.sequence_weights(aln, 'avg1')
        else:
            seq_weights = weights
        aa_frequencies = alnutils.aa_frequencies(aln, weights=seq_weights)
        col_consensus = make_entropy_col_consensus(aa_frequencies)
        # Loop-invariant: compute the total weight once, not once per column
        total_weight = sum(seq_weights)

        def is_majority_gap(col):
            gap_count = 0.0
            for wt, char in zip(seq_weights, col):
                if char == '-':
                    gap_count += wt
            return (gap_count / total_weight >= gap_threshold)

    # Traverse the alignment, handling gaps etc.
    def col_wise_consensus(columns):
        """Calculate the consensus chars for an iterable of columns."""
        if not trim_ends:
            # Track if we're in the N-term or C-term end of the sequence
            in_left_end = True
            maybe_right_tail = []
        for col in columns:
            # Lowercase cols mean explicitly, "don't include in consensus"
            if all(c.islower() for c in col if c not in '.-'):
                yield '-'
                continue
            if any(c.islower() for c in col):
                logging.warning('Mixed lowercase and uppercase letters in a '
                                'column: %s', ''.join(col))
                # Build a real list: on Python 3 ``map`` returns a one-shot
                # iterator, which has no count()/len() and would already be
                # exhausted by the gap check before col_consensus sees it.
                col = [c.upper() for c in col]
            # Gap chars
            is_gap = is_majority_gap(col)
            if not trim_ends:
                # Avoid N-terminal gaps in the consensus sequence
                if in_left_end:
                    if not is_gap:
                        # Match -- we're no longer in the left end
                        in_left_end = False
                    is_gap = False
            # When to yield a gap here:
            # -----------   ---------   ------  ----------
            # in_left_end   trim_ends   is_gap  yield gap?
            # -----------   ---------   ------  ----------
            # True          True        (True)  yes
            # True          False       (False) (no -- def. char)
            # False         True        T/F     yes, if is_gap
            # False         False       (T/F)   NO! use maybe_right_tail
            # -----------   ---------   ------  ----------
            if is_gap and trim_ends:
                yield '-'
                continue
            # Get the consensus character, using the chosen algorithm
            cons_char = col_consensus(col)
            if trim_ends:
                yield cons_char
            else:
                # Avoid C-terminal gaps in the consensus sequence
                if is_gap:
                    maybe_right_tail.append(cons_char)
                else:
                    # Match -> gaps weren't the right tail; emit all gaps
                    for char in maybe_right_tail:
                        yield '-'
                    maybe_right_tail = []
                    yield cons_char
        # Finally, if we were keeping a right (C-term) tail, emit it
        if not trim_ends:
            for char in maybe_right_tail:
                yield char

    return ''.join(col_wise_consensus(zip(*aln)))