예제 #1
0
파일: core.py 프로젝트: etal/cladecompare
def process_pair(fg_aln, bg_aln, module, do_weight):
    """Score each column of a foreground/background alignment pair.

    Both alignments are first passed through ``clean_alignments``; every
    column pair is then scored with ``module.compare_cols``, except columns
    whose consensus is a gap in either alignment, which get a p-value of 1.

    Parameters
    ----------
    fg_aln, bg_aln :
        Foreground and background alignments.
    module :
        Comparison strategy; must provide ``compare_cols``.
    do_weight : bool
        If true, sequence weights come from ``alnutils.sequence_weights``;
        otherwise every sequence gets a weight of 1.

    Returns
    -------
    tuple
        ``(fg_aln, bg_aln, hits)``: the cleaned alignments, plus a list of
        ``(foreground consensus aa, background consensus aa, p-value)``
        tuples, one per column position.
    """
    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)

    if not do_weight:
        fg_weights = [1] * len(fg_aln)
        bg_weights = [1] * len(bg_aln)
    else:
        fg_weights = alnutils.sequence_weights(fg_aln, 'none')
        bg_weights = alnutils.sequence_weights(bg_aln, 'none')

    # The urn module uses the plain sequence count as the foreground size
    if module != urn:
        fg_size = fsum(fg_weights)
    else:
        fg_size = len(fg_aln)
    bg_size = fsum(bg_weights)

    # Pooled amino acid frequencies, used for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)

    cons_opts = dict(trim_ends=False, gap_threshold=GAP_THRESH)
    fg_cons = consensus.consensus(fg_aln, weights=fg_weights, **cons_opts)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights, **cons_opts)

    hits = []
    columns = zip(fg_cons, bg_cons, zip(*fg_aln), zip(*bg_aln))
    for fg_char, bg_char, fg_col, bg_col in columns:
        if fg_char != '-' and bg_char != '-':
            score = module.compare_cols(fg_col, fg_char, fg_size, fg_weights,
                                        bg_col, bg_char, bg_size, bg_weights,
                                        aa_freqs, PSEUDO_SIZE)
        else:
            # Indel columns are not scored -- there are better ways to
            # look at these
            score = 1.
        hits.append((fg_char, bg_char, score))

    return fg_aln, bg_aln, hits
예제 #2
0
파일: core.py 프로젝트: zruan/cladecompare
def process_pair(fg_aln, bg_aln, module, nw):
    """Score each column of a foreground/background alignment pair.

    The alignments are cleaned with ``clean_alignments`` and each column
    pair is scored with ``module.compare_cols``. Columns whose consensus is
    a gap in either alignment are given a p-value of 1 instead.

    Parameters
    ----------
    fg_aln, bg_aln :
        Foreground and background alignments.
    module :
        Comparison strategy; must provide ``compare_cols``.
    nw : bool
        "No weighting": if true, every sequence gets a weight of 1;
        otherwise weights come from ``alnutils.sequence_weights``.

    Returns
    -------
    tuple
        ``(fg_aln, bg_aln, hits)``: the cleaned alignments, plus a list of
        ``(foreground consensus aa, background consensus aa, p-value)``
        tuples, one per column position.
    """
    def weigh(aln):
        """Return the per-sequence weights for one alignment."""
        if nw:
            return [1] * len(aln)
        return alnutils.sequence_weights(aln, 'none')

    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)
    fg_weights = weigh(fg_aln)
    bg_weights = weigh(bg_aln)

    # The urn module uses the plain sequence count as the foreground size
    if module != urn:
        fg_size = sum(fg_weights)
    else:
        fg_size = len(fg_aln)
    bg_size = sum(bg_weights)

    # Pooled amino acid frequencies, used for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)

    fg_cons = consensus.consensus(
        fg_aln, weights=fg_weights, trim_ends=False, gap_threshold=GAP_THRESH)
    bg_cons = consensus.consensus(
        bg_aln, weights=bg_weights, trim_ends=False, gap_threshold=GAP_THRESH)

    hits = []
    paired_columns = zip(fg_cons, bg_cons, zip(*fg_aln), zip(*bg_aln))
    for fg_char, bg_char, fg_col, bg_col in paired_columns:
        if fg_char == '-' or bg_char == '-':
            # Indel columns are not scored -- there are better ways to
            # look at these
            hits.append((fg_char, bg_char, 1.))
            continue
        score = module.compare_cols(fg_col, fg_char, fg_size, fg_weights,
                                    bg_col, bg_char, bg_size, bg_weights,
                                    aa_freqs, PSEUDO_SIZE)
        hits.append((fg_char, bg_char, score))

    return fg_aln, bg_aln, hits
예제 #3
0
def compare_aln(fg_aln, bg_aln):
    """Compare two alignments column by column with the ball-in-urn model.

    Like CHAIN does. For every column where neither consensus is a gap,
    the pseudocounted S, T and Y counts of the foreground column are tested
    against their frequency in the combined column with a cumulative
    binomial test.

    Returns a list of
    ``(foreground consensus aa, background consensus aa, p-value)`` tuples,
    one per column position.
    """
    def sty_total(counts):
        """Sum the S, T and Y entries of a column count mapping."""
        return counts['S'] + counts['T'] + counts['Y']

    # Only the background sequences are weighted; each foreground sequence
    # counts as 1
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_size = sum(bg_weights)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights)
    # Height of the foreground alignment column
    fg_size = len(fg_aln)
    fg_cons = consensus.consensus(fg_aln)
    fg_weights = [1] * fg_size
    pseudocounts = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)

    hits = []
    columns = zip(fg_cons, bg_cons, zip(*fg_aln), zip(*bg_aln))
    for fg_char, bg_char, fg_col, bg_col in columns:
        if fg_char == '-' or bg_char == '-':
            # Indel columns are not scored -- there are better ways to
            # look at these
            hits.append((fg_char, bg_char, 1.0))
            continue

        # S/T/Y count in the foreground column
        fg_tot = sty_total(count_col(fg_col, fg_weights, pseudocounts))
        # S/T/Y frequency in the combined alignment column
        bg_counts = count_col(bg_col, bg_weights, pseudocounts)
        combined_size = bg_size + fg_size + 2.0  # pseudocount size = 1.0
        p_j = (sty_total(bg_counts) + fg_tot) / combined_size

        # Probability of fg col conservation vs. the combined/main set
        # (P_j_LB in the publication)
        # NB: Some tweaks for pseudocounts
        successes = range(int(math.ceil(fg_tot)), fg_size + 2)
        pvalue = binom.pmf(successes, fg_size + 1, p_j).sum()
        if pvalue == 1.0:
            logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                         p_j, fg_tot, bg_counts)
        hits.append((fg_char, bg_char, pvalue))
    return hits
예제 #4
0
def compare_aln(fg_aln, bg_aln):
    """Run the ball-in-urn comparison over every column of two alignments.

    Like CHAIN does. Columns whose consensus is a gap in either alignment
    get a p-value of 1.0; all other columns get a cumulative binomial
    p-value computed from the pseudocounted S, T and Y counts of the
    foreground column and of the combined column.

    Returns a list of
    ``(foreground consensus aa, background consensus aa, p-value)`` tuples,
    one per column position.
    """
    # Background sequences carry weights; foreground sequences each count 1
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_size = sum(bg_weights)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights)
    # Number of sequences, i.e. the height of a foreground column
    fg_size = len(fg_aln)
    fg_cons = consensus.consensus(fg_aln)
    fg_columns = zip(*fg_aln)
    bg_columns = zip(*bg_aln)
    fg_weights = [1] * fg_size
    pseudocounts = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)
    # Denominator of the combined frequency (pseudocount size = 1.0)
    denominator = bg_size + fg_size + 2.0
    n_trials = fg_size + 1

    hits = []
    for f_res, b_res, f_col, b_col in zip(fg_cons, bg_cons,
                                          fg_columns, bg_columns):
        if f_res != '-' and b_res != '-':
            # S/T/Y count in the foreground column
            fg_counts = count_col(f_col, fg_weights, pseudocounts)
            fg_tot = fg_counts['S'] + fg_counts['T'] + fg_counts['Y']
            # S/T/Y frequency in the combined alignment column
            bg_counts = count_col(b_col, bg_weights, pseudocounts)
            numerator = (bg_counts['S'] + bg_counts['T'] + bg_counts['Y']
                         + fg_tot)
            p_j = numerator / denominator

            # Probability of fg col conservation vs. the combined/main set
            # (P_j_LB in the publication)
            # NB: Some tweaks for pseudocounts
            lower = int(math.ceil(fg_tot))
            pvalue = binom.pmf(range(lower, fg_size + 2), n_trials, p_j).sum()
            if pvalue == 1.0:
                logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                             p_j, fg_tot, bg_counts)
        else:
            # Indel columns are not scored -- there are better ways to
            # look at these
            pvalue = 1.0
        hits.append((f_res, b_res, pvalue))
    return hits
예제 #5
0
파일: core.py 프로젝트: zruan/cladecompare
def process_one(aln, module, nw):
    """Score each column of a single alignment.

    Every column whose consensus is not a gap is scored with
    ``module.compare_one``; gap-consensus columns get a p-value of 1.

    Parameters
    ----------
    aln :
        The alignment to score.
    module :
        Comparison strategy; must provide ``compare_one``.
    nw : bool
        "No weighting": if true, every sequence gets a weight of 1;
        otherwise weights come from ``alnutils.sequence_weights``.

    Returns
    -------
    tuple
        ``(aln, hits)`` where ``hits`` holds one
        ``(consensus aa, '_', p-value)`` tuple per column position.
    """
    if not nw:
        weights = alnutils.sequence_weights(aln, 'none')
    else:
        weights = [1] * len(aln)

    # The urn module uses the plain sequence count as the alignment size
    if module != urn:
        aln_size = sum(weights)
    else:
        aln_size = len(aln)

    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(
        aln, weights=weights, trim_ends=False, gap_threshold=GAP_THRESH)

    def score(cons_aa, col):
        """Return the p-value for one column."""
        if cons_aa == '-':
            # Indel columns are not scored -- there are better ways to
            # look at these
            return 1.
        return module.compare_one(col, cons_aa, aln_size, weights,
                                  aa_freqs, PSEUDO_SIZE)

    hits = [(cons_aa, '_', score(cons_aa, col))
            for cons_aa, col in zip(cons, zip(*aln))]
    return aln, hits
예제 #6
0
파일: core.py 프로젝트: etal/cladecompare
def process_one(aln, module, do_weight):
    """Score each column of a single alignment.

    Every column whose consensus is not a gap is scored with
    ``module.compare_one``; gap-consensus columns get a p-value of 1.

    Parameters
    ----------
    aln :
        The alignment to score.
    module :
        Comparison strategy; must provide ``compare_one``.
    do_weight : bool
        If true, sequence weights come from ``alnutils.sequence_weights``;
        otherwise every sequence gets a weight of 1.

    Returns
    -------
    tuple
        ``(aln, hits)`` where ``hits`` holds one
        ``(consensus aa, '_', p-value)`` tuple per column position.
    """
    if not do_weight:
        weights = [1] * len(aln)
    else:
        weights = alnutils.sequence_weights(aln, 'none')

    # The urn module uses the plain sequence count as the alignment size
    if module != urn:
        aln_size = fsum(weights)
    else:
        aln_size = len(aln)

    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(
        aln, weights=weights, trim_ends=False, gap_threshold=GAP_THRESH)

    hits = []
    for residue, column in zip(cons, zip(*aln)):
        if residue != '-':
            pvalue = module.compare_one(column, residue, aln_size, weights,
                                        aa_freqs, PSEUDO_SIZE)
        else:
            # Indel columns are not scored -- there are better ways to
            # look at these
            pvalue = 1.
        hits.append((residue, '_', pvalue))
    return aln, hits
예제 #7
0
파일: consensus.py 프로젝트: etal/biofrills
def consensus(aln, weights=None, gap_threshold=0.5, simple=False, trim_ends=True):
    """Get the consensus of an alignment, as a string.

    Emit gap characters for majority-gap columns; apply various strategies to
    choose the consensus amino acid type for the remaining columns.

    Columns in which every non-gap character is lowercase are treated as
    explicitly excluded and always yield a gap. Columns mixing lowercase and
    uppercase letters are logged as a warning and processed as uppercase.

    Parameters
    ----------

    aln :
        The alignment; it is transposed with ``zip(*aln)``, so each row must
        be an iterable of single characters.
    weights : sequence of numbers or None
        Sequence weights, one per alignment row, in row order. If given, used
        to calculate amino acid frequencies; otherwise calculated within this
        function (i.e. this is a way to speed up the function if sequence
        weights have already been calculated). Ignored in 'simple' mode.
    gap_threshold : float
        If the proportion of gap characters in a column is greater than or equal
        to this value (after sequence weighting, if applicable), then the
        consensus character emitted will be a gap instead of an amino acid type.
    simple : bool
        If True, use simple plurality to determine the consensus amino acid
        type, without weighting sequences for similarity. Otherwise, weight
        sequences for similarity and use relative entropy to choose the
        consensus amino acid type.
    trim_ends : bool
        If False, stretch the consensus sequence to include the N- and C-tails
        of the alignment, even if those flanking columns are mostly gap
        characters. This avoids terminal gaps in the consensus (needed for
        MAPGAPS).

    Returns
    -------
    str
        The consensus characters, one per emitted column, joined together.
    """
    # Choose your algorithms!
    if simple:
        # Use the simple, unweighted algorithm
        col_consensus = make_simple_col_consensus(alnutils.aa_frequencies(aln))

        def is_majority_gap(col):
            return (float(col.count('-')) / len(col) >= gap_threshold)
        # ENH (alternatively/additionally): does any aa occur more than once?
        # ENH: choose gap-decisionmaking separately from col_consensus
    else:
        # Use the entropy-based, weighted algorithm
        if weights is None:
            seq_weights = alnutils.sequence_weights(aln, 'avg1')
        else:
            seq_weights = weights
        aa_frequencies = alnutils.aa_frequencies(aln, weights=seq_weights)
        col_consensus = make_entropy_col_consensus(aa_frequencies)
        # The total weight is the same for every column, so compute it once
        total_weight = sum(seq_weights)

        def is_majority_gap(col):
            gap_count = 0.0
            for wt, char in zip(seq_weights, col):
                if char == '-':
                    gap_count += wt
            return (gap_count / total_weight >= gap_threshold)

    # Traverse the alignment, handling gaps etc.
    def col_wise_consensus(columns):
        """Calculate the consensus chars for an iterable of columns."""
        if not trim_ends:
            # Track if we're in the N-term or C-term end of the sequence
            in_left_end = True
            maybe_right_tail = []
        for col in columns:
            # Lowercase cols mean explicitly, "don't include in consensus"
            # NOTE(review): with trim_ends=False this gap is emitted at once,
            # ahead of any characters still buffered in maybe_right_tail --
            # confirm that ordering is intended.
            if all(c.islower() for c in col if c not in '.-'):
                yield '-'
                continue
            if any(c.islower() for c in col):
                logging.warning('Mixed lowercase and uppercase letters in a '
                                'column: %s', ''.join(col))
                # Build a real list: the column is traversed more than once
                # below (gap test, then consensus), and the gap test in
                # simple mode needs .count() and len(), which a lazy map
                # object does not support.
                col = [c.upper() for c in col]

            # Gap chars
            is_gap = is_majority_gap(col)
            if not trim_ends:
                # Avoid N-terminal gaps in the consensus sequence
                if in_left_end:
                    if not is_gap:
                        # Match -- we're no longer in the left end
                        in_left_end = False
                    is_gap = False

            # When to yield a gap here:
            #   -----------     ---------   ------  ----------
            #   in_left_end     trim_ends   is_gap  yield gap?
            #   -----------     ---------   ------  ----------
            #   True            True        (True)  yes
            #   True            False       (False) (no -- def. char)
            #   False           True        T/F     yes, if is_gap
            #   False           False       (T/F)   NO! use maybe_right_tail
            #   -----------     ---------   ------  ----------

            if is_gap and trim_ends:
                yield '-'
                continue

            # Get the consensus character, using the chosen algorithm
            cons_char = col_consensus(col)

            if trim_ends:
                yield cons_char
            else:
                # Avoid C-terminal gaps in the consensus sequence
                if is_gap:
                    # Might be the C-terminal tail: hold the character back
                    # until we know whether a match column follows
                    maybe_right_tail.append(cons_char)
                else:
                    # Match -> gaps weren't the right tail; emit all gaps
                    for _ in maybe_right_tail:
                        yield '-'
                    maybe_right_tail = []
                    yield cons_char

        # Finally, if we were keeping a right (C-term) tail, emit it
        if not trim_ends:
            for char in maybe_right_tail:
                yield char

    return ''.join(col_wise_consensus(zip(*aln)))