예제 #1
0
def compare_aln(fg_aln, bg_aln):
    """Contrast two alignments column by column with the ball-in-urn model.

    This follows the approach used by CHAIN: a cumulative binomial test is
    run on every column, using the pooled S, T and Y counts of that column.
    Columns where either consensus residue is a gap are not tested and are
    reported with a p-value of 1.0.

    Returns a list holding one tuple per column:
        (foreground consensus aa, background consensus aa, p-value)
    """
    # Background weights come from alnutils; each foreground sequence
    # simply counts as 1
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_total = sum(bg_weights)
    bg_consensus = consensus.consensus(bg_aln, weights=bg_weights)
    # Number of sequences (rows) in the foreground alignment
    fg_total = len(fg_aln)
    fg_consensus = consensus.consensus(fg_aln)
    fg_columns = zip(*fg_aln)
    bg_columns = zip(*bg_aln)
    fg_weights = [1] * fg_total
    pseudocounts = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)

    # Size of the combined set, padded by 2.0 (pseudocount size = 1.0)
    pooled_size = bg_total + fg_total + 2.0
    # NB: the test uses one trial more than the foreground height
    # (pseudocount tweak), so the upper tail ends at n_trials inclusive
    n_trials = fg_total + 1
    residues = ('S', 'T', 'Y')

    results = []
    paired = zip(fg_consensus, bg_consensus, fg_columns, bg_columns)
    for fg_aa, bg_aa, fg_column, bg_column in paired:
        if fg_aa != '-' and bg_aa != '-':
            # Cumulative binomial test on the S/T/Y counts
            fg_counts = count_col(fg_column, fg_weights, pseudocounts)
            fg_hits = sum(fg_counts[aa] for aa in residues)
            bg_counts = count_col(bg_column, bg_weights, pseudocounts)
            bg_hits = sum(bg_counts[aa] for aa in residues)
            # S/T/Y frequency in the combined alignment column
            p_j = (bg_hits + fg_hits) / pooled_size

            # Probability of fg col conservation vs. the combined/main set
            # (P_j_LB in the publication)
            first_k = int(math.ceil(fg_hits))
            upper_tail = binom.pmf(range(first_k, n_trials + 1), n_trials, p_j)
            pvalue = upper_tail.sum()
            if pvalue == 1.0:
                logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                             p_j, fg_hits, bg_counts)
        else:
            # Indel columns are not tested here -- there are better ways
            # to look at these
            pvalue = 1.0
        results.append((fg_aa, bg_aa, pvalue))
    return results
예제 #2
0
def compare_aln(fg_aln, bg_aln):
    """Run a per-column ball-in-urn comparison of foreground vs. background.

    Modeled on what CHAIN does. For each aligned column the S, T and Y
    counts of the foreground are tested against the combined
    foreground+background frequency with a cumulative binomial test.
    Columns whose foreground or background consensus is '-' get p = 1.0
    without being tested.

    Returns a list of (foreground consensus aa, background consensus aa,
    p-value) tuples, one per column.
    """
    # BG sequences receive weights from alnutils; FG sequences are all 1
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_size = sum(bg_weights)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights)
    # Height of the foreground alignment column
    fg_size = len(fg_aln)
    fg_cons = consensus.consensus(fg_aln)
    fg_cols = zip(*fg_aln)
    bg_cols = zip(*bg_aln)
    fg_weights = [1] * fg_size
    pseudocounts = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)

    hits = []
    for faa, baa, fg_col, bg_col in zip(fg_cons, bg_cons, fg_cols, bg_cols):
        if faa == '-' or baa == '-':
            # Skip indel columns -- there are better ways to look at these
            hits.append((faa, baa, 1.0))
            continue

        # S/T/Y count in the foreground column
        fg_counts = count_col(fg_col, fg_weights, pseudocounts)
        fg_tot = fg_counts['S'] + fg_counts['T'] + fg_counts['Y']
        # S/T/Y frequency in the combined alignment column
        bg_counts = count_col(bg_col, bg_weights, pseudocounts)
        numerator = bg_counts['S'] + bg_counts['T'] + bg_counts['Y'] + fg_tot
        denominator = bg_size + fg_size + 2.0  # pseudocount size = 1.0
        p_j = numerator / denominator

        # Probability of fg col conservation vs. the combined/main set
        # (P_j_LB in the publication); trial count and tail bound are each
        # bumped by one as a pseudocount tweak
        successes = range(int(math.ceil(fg_tot)), fg_size + 2)
        pvalue = binom.pmf(successes, fg_size + 1, p_j).sum()
        if pvalue == 1.0:
            logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                         p_j, fg_tot, bg_counts)
        hits.append((faa, baa, pvalue))
    return hits
예제 #3
0
def cat_sub_consenses(task):
    """Write the consensus of every subfamily alignment into task.target.

    Iterates over ext(task.depends, 'aln'); each entry is read as a Clustal
    alignment and emitted as a FASTA-style record (">NAME consensus" plus
    the consensus sequence). If a directory named like the alignment without
    its extension exists, the content of the matching 'fasta' file is
    appended too.
    """
    with open(task.target, 'w+') as outfile:
        for subaln in ext(task.depends, 'aln'):
            aln = AlignIO.read(str(subaln), 'clustal')
            stem = noext(subaln)
            outfile.write(">%s consensus\n" % stem)
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            outfile.write(cons_seq + "\n")
            if not isdir(stem):
                continue
            # Group profiles: include the subfamily consenses, too
            with open(ext(subaln, 'fasta')) as subfam_file:
                outfile.write(subfam_file.read())
예제 #4
0
파일: core.py 프로젝트: etal/cladecompare
def process_pair(fg_aln, bg_aln, module, do_weight):
    """Compute the per-column "contrast" between two alignments.

    Both alignments are first passed through clean_alignments(). Sequence
    weights come from alnutils.sequence_weights() when do_weight is true;
    otherwise every sequence has weight 1. Each non-gap column is scored
    by module.compare_cols().

    Returns a 3-tuple (fg_aln, bg_aln, hits), where the alignments are the
    cleaned ones and hits is a list with one tuple per column position:
        (foreground consensus aa, background consensus aa, p-value)
    """
    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)
    if not do_weight:
        fg_weights = [1] * len(fg_aln)
        bg_weights = [1] * len(bg_aln)
    else:
        fg_weights = alnutils.sequence_weights(fg_aln, 'none')
        bg_weights = alnutils.sequence_weights(bg_aln, 'none')

    # The urn module takes the plain sequence count as foreground size;
    # every other module gets the summed weights
    if module != urn:
        fg_size = fsum(fg_weights)
    else:
        fg_size = len(fg_aln)
    bg_size = fsum(bg_weights)

    # Overall aa freqs for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)
    cons_options = dict(trim_ends=False, gap_threshold=GAP_THRESH)
    fg_cons = consensus.consensus(fg_aln, weights=fg_weights, **cons_options)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights, **cons_options)

    hits = []
    columns = zip(fg_cons, bg_cons, zip(*fg_aln), zip(*bg_aln))
    for faa, baa, fg_col, bg_col in columns:
        if faa != '-' and baa != '-':
            pvalue = module.compare_cols(fg_col, faa, fg_size, fg_weights,
                                         bg_col, baa, bg_size, bg_weights,
                                         aa_freqs, PSEUDO_SIZE)
        else:
            # Indel columns are not scored -- there are better ways to
            # look at these
            pvalue = 1.
        hits.append((faa, baa, pvalue))

    return fg_aln, bg_aln, hits
예제 #5
0
파일: core.py 프로젝트: zruan/cladecompare
def process_pair(fg_aln, bg_aln, module, nw):
    """Compute the per-column "contrast" between two alignments.

    Both alignments are first passed through clean_alignments(). When nw
    is true every sequence gets weight 1; otherwise the weights come from
    alnutils.sequence_weights(). Each non-gap column is scored by
    module.compare_cols().

    Returns a 3-tuple (fg_aln, bg_aln, hits), where the alignments are the
    cleaned ones and hits is a list with one tuple per column position:
        (foreground consensus aa, background consensus aa, p-value)
    """
    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)
    if nw:
        fg_weights = [1] * len(fg_aln)
        bg_weights = [1] * len(bg_aln)
    else:
        fg_weights = alnutils.sequence_weights(fg_aln, 'none')
        bg_weights = alnutils.sequence_weights(bg_aln, 'none')

    # The urn module takes the plain sequence count as foreground size;
    # every other module gets the summed weights
    if module != urn:
        fg_size = sum(fg_weights)
    else:
        fg_size = len(fg_aln)
    bg_size = sum(bg_weights)

    # Overall aa freqs for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)
    fg_cons = consensus.consensus(
        fg_aln, weights=fg_weights, trim_ends=False,
        gap_threshold=GAP_THRESH)
    bg_cons = consensus.consensus(
        bg_aln, weights=bg_weights, trim_ends=False,
        gap_threshold=GAP_THRESH)

    hits = []
    fg_columns = zip(*fg_aln)
    bg_columns = zip(*bg_aln)
    for faa, baa, fg_col, bg_col in zip(fg_cons, bg_cons,
                                        fg_columns, bg_columns):
        # Indel columns are not scored -- there are better ways to look
        # at these -- so they keep the default p-value of 1
        pvalue = 1.
        if faa != '-' and baa != '-':
            pvalue = module.compare_cols(
                fg_col, faa, fg_size, fg_weights,
                bg_col, baa, bg_size, bg_weights,
                aa_freqs, PSEUDO_SIZE)
        hits.append((faa, baa, pvalue))

    return fg_aln, bg_aln, hits
예제 #6
0
def mg_aln2cma(task, level=None):
    """Build a CMA (or .tpl) file from an alignment profile.

    The Clustal alignment in task.depends[0] is read, its consensus is
    inserted as the first record, and the result is written to task.target
    through biocma with do_iron=True.

    Depends: .aln
    Cleans: .cons.cma, .cons_iron.cma
    """
    name = basename(noext(task.target))
    aln = AlignIO.read(str(task.depends[0]), 'clustal')
    # Add the consensus back to the subfamily-consensus seq set (.aln)
    # to produce a CMA (.cons.cma)
    cons_seq = consensus.consensus(aln, trim_ends=False, gap_threshold=0.6)
    cons_rec = SeqRecord(Seq(cons_seq),
                         id=name,
                         description=name + ' consensus')
    # NOTE(review): this reaches into the alignment's private _records list
    aln._records.insert(0, cons_rec)
    # Tidy up the CMA
    cmaln = biocma.ChainMultiAlignment(aln, level=level)
    biocma.write([cmaln], task.target, do_iron=True)
예제 #7
0
파일: core.py 프로젝트: zruan/cladecompare
def process_one(aln, module, nw):
    """Compute the per-column "contrast" within a single alignment.

    When nw is true every sequence gets weight 1; otherwise the weights
    come from alnutils.sequence_weights(). Each non-gap column is scored
    by module.compare_one().

    Returns (aln, hits), where hits holds one tuple per column:
        (consensus aa, '_', p-value)
    """
    if nw:
        weights = [1] * len(aln)
    else:
        # NOTE: the original left a disabled alternative here that would use
        # 'sum1' instead of 'none' for the jsd module
        weights = alnutils.sequence_weights(aln, 'none')

    # The urn module takes the plain sequence count as alignment size;
    # every other module gets the summed weights
    if module != urn:
        aln_size = sum(weights)
    else:
        aln_size = len(aln)
    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(aln, weights=weights, trim_ends=False,
                               gap_threshold=GAP_THRESH)

    hits = []
    for cons_aa, col in zip(cons, zip(*aln)):
        # Indel columns are not scored -- there are better ways to look
        # at these -- so they keep the default p-value of 1
        pvalue = 1.
        if cons_aa != '-':
            pvalue = module.compare_one(col, cons_aa, aln_size, weights,
                                        aa_freqs, PSEUDO_SIZE)
        hits.append((cons_aa, '_', pvalue))
    return aln, hits
예제 #8
0
파일: core.py 프로젝트: etal/cladecompare
def process_one(aln, module, do_weight):
    """Compute the per-column "contrast" within a single alignment.

    Sequence weights come from alnutils.sequence_weights() when do_weight
    is true; otherwise every sequence has weight 1. Each non-gap column is
    scored by module.compare_one().

    Returns (aln, hits), where hits holds one tuple per column:
        (consensus aa, '_', p-value)
    """
    if not do_weight:
        weights = [1] * len(aln)
    else:
        # NOTE: the original left a disabled alternative here that would use
        # 'sum1' instead of 'none' for the jsd module
        weights = alnutils.sequence_weights(aln, 'none')

    # The urn module takes the plain sequence count as alignment size;
    # every other module gets the summed weights
    if module != urn:
        aln_size = fsum(weights)
    else:
        aln_size = len(aln)
    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(aln, weights=weights, trim_ends=False,
                               gap_threshold=GAP_THRESH)

    hits = []
    columns = zip(*aln)
    for cons_aa, col in zip(cons, columns):
        if cons_aa != '-':
            pvalue = module.compare_one(col, cons_aa, aln_size, weights,
                                        aa_freqs, PSEUDO_SIZE)
        else:
            # Indel columns are not scored -- there are better ways to
            # look at these
            pvalue = 1.
        hits.append((cons_aa, '_', pvalue))
    return aln, hits
예제 #9
0
def align_profiles(task, use_pdb=None):
    """Align several FASTA files with MAFFT and write Clustal-like output.

    The last entry of task.depends is treated as the PDB alignment; all
    earlier entries are Clustal subalignments. For each subalignment a
    'cons.seq' file is written. Those belonging to a group (a directory
    named like the alignment exists) are passed to MAFFT as --seed
    profiles; the others are concatenated into the 'families.fa' file.
    The MAFFT result is filtered, stripped of empty columns, and written
    to task.target as a single-block Clustal-style file.

    Cleans: [depends].cons.seq, [target].families.fa, [target].families.seq,
        [target].seq

    Raises ValueError when no records are left to write.
    """
    subalignments = task.depends[:-1]
    pdb_seed = str(task.depends[-1])
    seeds = []
    singles = []
    # PDB alignment -- include as a seed profile if requested.
    # NOTE(review): the message below is also logged when use_pdb is falsy,
    # whether or not the file is actually empty.
    if not use_pdb or is_empty(pdb_seed):
        logging.info("Empty PDB alignment: %s", pdb_seed)
    else:
        seeds.append(pdb_seed)

    # Get subfamily and subgroup consensus sequences/profiles
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as outfile:
            stem = noext(subaln)
            outfile.write(">%s consensus\n" % basename(stem))
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            if not isdir(stem):
                # Ungapped family consensus sequences
                outfile.write(cons_seq.replace('-', '') + "\n")
                continue
            # Group profiles: include the subfamily consenses, too
            outfile.write(cons_seq + "\n")
            for record in aln:
                outfile.write(">%s\n" % record.id)
                outfile.write("%s\n" % record.seq)

    # Merge the sequences and profiles: groups become seeds, the rest are
    # single families. The slice drops the trailing 9 characters
    # (the length of '.cons.seq').
    for subconsseq in ext(subalignments, 'cons.seq'):
        bucket = seeds if isdir(subconsseq[:-9]) else singles
        bucket.append(subconsseq)

    # First, align/merge the single family consensus sequences
    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
            'No .fasta files found to build %s' % task.target
    if singles:
        sh("cat %s > %s" % (' '.join(singles), famfa))
    # NOTE(review): with seeds but no singles, famfa is not (re)written above
    # yet is still handed to MAFFT below -- confirm this is intended.
    if not seeds:
        # No group profiles -- just align the families
        sh("mafft --quiet --amino --maxiterate 1000 %s > %s" % (famfa, allseq))
    else:
        # Align the families with the groups
        # (XXX a faster variant would use "--auto" instead of
        # "--maxiterate 1000")
        seed_args = ' '.join(['--seed ' + s for s in seeds])
        sh("mafft --quiet --amino --maxiterate 1000 %s %s > %s" %
           (seed_args, famfa, allseq))

    # Convert FASTA to "pressed" (single-row) Clustal
    records = []
    for rec in SeqIO.parse(allseq, 'fasta'):
        # Drop PDB-derived sequences
        if ('TMalign' in rec.description
                or 'TM-score' in rec.description
                or rec.id.endswith('.pdb')):
            continue
        records.append(rec)
    records = list(alnutils.remove_empty_cols(records))
    if seeds:
        # MAFFT prefixes seed alignments with '_seed_' -- get rid of that
        prefix = '_seed_'
        for rec in records:
            if rec.id.startswith(prefix):
                rec.id = rec.id[len(prefix):]
    try:
        max_id_len = max(len(r.id) for r in records)
    except ValueError:
        # max() had nothing to work with: no records survived
        raise ValueError("Profile alignment failed for %s.\nInputs: %s" %
                         (task.target, ' '.join(map(str, task.depends))))

    lines = ['%s %s\n' % (rec.id.ljust(max_id_len), rec.seq)
             for rec in records]
    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        outfile.writelines(lines)
예제 #10
0
파일: build.py 프로젝트: Tsingke/fammer
def align_profiles(task, use_pdb=None):
    """Align several FASTA files with MAFFT and write Clustal-like output.

    The last entry of task.depends is treated as the PDB alignment; all
    earlier entries are Clustal subalignments. A 'cons.seq' file is written
    for each subalignment. Those belonging to a group (a directory named
    like the alignment exists) are given to MAFFT as --seed profiles, while
    the others are concatenated into the 'families.fa' file. The MAFFT
    output is filtered, stripped of empty columns, and saved to task.target
    as a single-block Clustal-style file.

    Cleans: [depends].cons.seq, [target].families.fa, [target].families.seq,
        [target].seq

    Raises ValueError when no records are left to write.
    """
    seeds = []
    singles = []
    # PDB alignment -- include as a seed profile if requested
    subalignments = task.depends[:-1]
    pdb_seed = str(task.depends[-1])
    # NOTE(review): the "Empty PDB alignment" message is also logged when
    # use_pdb is falsy, whether or not the file is actually empty.
    if use_pdb and not is_empty(pdb_seed):
        seeds.append(pdb_seed)
    else:
        logging.info("Empty PDB alignment: %s", pdb_seed)

    # Get subfamily and subgroup consensus sequences/profiles
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as outfile:
            outfile.write(">%s consensus\n" % basename(noext(subaln)))
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            is_group = isdir(noext(subaln))
            if is_group:
                # Group profiles: include the subfamily consenses, too
                outfile.write(cons_seq + "\n")
                for record in aln:
                    outfile.write(">%s\n" % record.id)
                    outfile.write("%s\n" % record.seq)
            else:
                # Ungapped family consensus sequences
                ungapped = cons_seq.replace('-', '')
                outfile.write(ungapped + "\n")

    # Merge the sequences and profiles. The slice drops the trailing
    # 9 characters (the length of '.cons.seq').
    for subconsseq in ext(subalignments, 'cons.seq'):
        if not isdir(subconsseq[:-9]):
            singles.append(subconsseq)
            continue
        # Group
        seeds.append(subconsseq)

    # First, align/merge the single family consensus sequences
    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
            'No .fasta files found to build %s' % task.target
    if singles:
        cat_inputs = ' '.join(singles)
        sh("cat %s > %s" % (cat_inputs, famfa))
    # NOTE(review): with seeds but no singles, famfa is not (re)written above
    # yet is still handed to MAFFT below -- confirm this is intended.
    if seeds:
        # Align the families with the groups
        # (XXX a faster variant would use "--auto" instead of
        # "--globalgenafpair --maxiterate 1000")
        seed_flags = ' '.join(['--seed '+s for s in seeds])
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s %s > %s"
           % (seed_flags, famfa, allseq))
    else:
        # No group profiles -- just align the families
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s > %s"
           % (famfa, allseq))

    # Convert FASTA to "pressed" (single-row) Clustal,
    # dropping PDB-derived sequences along the way
    records = [rec for rec in SeqIO.parse(allseq, 'fasta')
               if not ('TMalign' in rec.description
                       or 'TM-score' in rec.description
                       or rec.id.endswith('.pdb'))]
    records = list(alnutils.remove_empty_cols(records))
    if seeds:
        # MAFFT prefixes seed alignments with '_seed_' -- get rid of that
        for rec in records:
            if not rec.id.startswith('_seed_'):
                continue
            rec.id = rec.id[6:]
    try:
        max_id_len = max(len(r.id) for r in records)
    except ValueError:
        # max() had nothing to work with: no records survived
        raise ValueError("Profile alignment failed for %s.\nInputs: %s"
                         % (task.target, ' '.join(map(str, task.depends))))

    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        rows = ['%s %s\n' % (rec.id.ljust(max_id_len), rec.seq)
                for rec in records]
        outfile.writelines(rows)