def compare_aln(fg_aln, bg_aln):
    """Score foreground columns against the background, ball-in-urn style.

    Follows the approach CHAIN uses. For each aligned column the summed
    S, T and Y counts of the foreground are tested against their frequency
    in the combined (foreground + background) column with a cumulative
    binomial test.

    Returns a list with one tuple per column:
    (foreground consensus aa, background consensus aa, p-value).
    Columns whose consensus is a gap in either alignment get p-value 1.0.
    """
    # Only the background sequences are weighted; every foreground sequence
    # counts as exactly one.
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_size = sum(bg_weights)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights)
    # Number of rows, i.e. the height of each foreground column
    fg_size = len(fg_aln)
    fg_cons = consensus.consensus(fg_aln)
    fg_columns = zip(*fg_aln)
    bg_columns = zip(*bg_aln)
    fg_weights = [1] * fg_size
    pseudocounts = combined_frequencies(fg_aln, fg_weights,
                                        bg_aln, bg_weights)
    # Height of the combined column; the extra 2.0 goes with a
    # pseudocount size of 1.0
    combined_size = bg_size + fg_size + 2.0

    results = []
    for fg_aa, bg_aa, fg_col, bg_col in zip(fg_cons, bg_cons,
                                            fg_columns, bg_columns):
        if '-' in (fg_aa, bg_aa):
            # Indel columns are not scored here
            results.append((fg_aa, bg_aa, 1.0))
            continue

        # S/T/Y tally in the foreground column
        fg_counts = count_col(fg_col, fg_weights, pseudocounts)
        fg_tot = fg_counts['S'] + fg_counts['T'] + fg_counts['Y']
        # S/T/Y frequency in the combined alignment column
        bg_counts = count_col(bg_col, bg_weights, pseudocounts)
        bg_tot = bg_counts['S'] + bg_counts['T'] + bg_counts['Y']
        p_j = (bg_tot + fg_tot) / combined_size

        # Upper-tail binomial probability (P_j_LB in the publication);
        # the +1/+2 offsets are adjustments for the pseudocounts.
        tail = range(int(math.ceil(fg_tot)), fg_size + 2)
        pvalue = binom.pmf(tail, fg_size + 1, p_j).sum()
        if pvalue == 1.0:
            logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                         p_j, fg_tot, bg_counts)
        results.append((fg_aa, bg_aa, pvalue))
    return results
def compare_aln(fg_aln, bg_aln):
    """Compare two alignments column by column with the ball-in-urn model.

    Modelled on what CHAIN does. The statistic for a column is the summed
    count of S, T and Y residues in the foreground, tested with a
    cumulative binomial against the S/T/Y frequency of the combined column.

    Returns a list of (foreground consensus aa, background consensus aa,
    p-value) tuples, one per column; gap-consensus columns get 1.0.
    """
    def sty_total(counts):
        # Summed S, T and Y counts of one column
        return counts['S'] + counts['T'] + counts['Y']

    # Background sequences carry weights, foreground sequences do not
    bg_weights = alnutils.sequence_weights(bg_aln, 'none')
    bg_size = sum(bg_weights)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights)
    # The foreground column height is simply the number of rows
    fg_size = len(fg_aln)
    fg_cons = consensus.consensus(fg_aln)
    fg_cols = zip(*fg_aln)
    bg_cols = zip(*bg_aln)
    fg_weights = [1] * fg_size
    pseudocounts = combined_frequencies(fg_aln, fg_weights,
                                        bg_aln, bg_weights)

    hits = []
    for faa, baa, fg_col, bg_col in zip(fg_cons, bg_cons, fg_cols, bg_cols):
        if faa != '-' and baa != '-':
            fg_tot = sty_total(count_col(fg_col, fg_weights, pseudocounts))
            bg_counts = count_col(bg_col, bg_weights, pseudocounts)
            # Frequency in the combined column (pseudocount size = 1.0)
            p_j = (sty_total(bg_counts) + fg_tot) / (bg_size + fg_size + 2.0)
            # Probability of the foreground conservation vs. the combined
            # set (P_j_LB in the publication), adjusted for pseudocounts
            first = int(math.ceil(fg_tot))
            pvalue = binom.pmf(range(first, fg_size + 2),
                               fg_size + 1, p_j).sum()
            if pvalue == 1.0:
                logging.info("Meaningless p-value: p_j=%s, fg=%s vs. bg=%s",
                             p_j, fg_tot, bg_counts)
        else:
            # Indel columns are skipped; they deserve separate treatment
            pvalue = 1.0
        hits.append((faa, baa, pvalue))
    return hits
def cat_sub_consenses(task):
    """Write every subfamily consensus sequence into one FASTA-style file.

    For each '.aln' counterpart of ``task.depends`` a ">NAME consensus"
    record is written to ``task.target``. When a directory named after the
    alignment (minus extension) exists, the contents of the matching
    '.fasta' file are appended after the record.
    """
    with open(task.target, 'w+') as outfile:
        for subaln in ext(task.depends, 'aln'):
            alignment = AlignIO.read(str(subaln), 'clustal')
            stem = noext(subaln)
            outfile.write(">%s consensus\n" % stem)
            cons_seq = consensus.consensus(alignment, trim_ends=False,
                                           gap_threshold=0.6)
            outfile.write(cons_seq + "\n")
            if not isdir(stem):
                continue
            # Group profile: pull in the subfamily consensus sequences, too
            with open(ext(subaln, 'fasta')) as subfam_file:
                outfile.write(subfam_file.read())
def process_pair(fg_aln, bg_aln, module, do_weight):
    """Compute per-column "contrast" between two alignments.

    Both alignments are first passed through ``clean_alignments``. If
    ``do_weight`` is true, sequence weights come from
    ``alnutils.sequence_weights``; otherwise every sequence weighs 1.
    ``module`` supplies the column test via ``module.compare_cols``.

    Returns ``(fg_aln, bg_aln, hits)`` with the cleaned alignments, where
    ``hits`` holds one (foreground consensus aa, background consensus aa,
    p-value) tuple per column position.
    """
    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)
    if not do_weight:
        fg_weights = [1] * len(fg_aln)
        bg_weights = [1] * len(bg_aln)
    else:
        fg_weights = alnutils.sequence_weights(fg_aln, 'none')
        bg_weights = alnutils.sequence_weights(bg_aln, 'none')

    # The urn module takes the raw row count as the foreground size
    if module != urn:
        fg_size = fsum(fg_weights)
    else:
        fg_size = len(fg_aln)
    bg_size = fsum(bg_weights)

    # Overall amino acid frequencies, used for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)
    cons_opts = dict(trim_ends=False, gap_threshold=GAP_THRESH)
    fg_cons = consensus.consensus(fg_aln, weights=fg_weights, **cons_opts)
    bg_cons = consensus.consensus(bg_aln, weights=bg_weights, **cons_opts)

    # Indel columns are not tested and simply get a p-value of 1
    hits = [
        (faa, baa,
         1. if '-' in (faa, baa)
         else module.compare_cols(fg_col, faa, fg_size, fg_weights,
                                  bg_col, baa, bg_size, bg_weights,
                                  aa_freqs, PSEUDO_SIZE))
        for faa, baa, fg_col, bg_col in zip(fg_cons, bg_cons,
                                            zip(*fg_aln), zip(*bg_aln))
    ]
    return fg_aln, bg_aln, hits
def process_pair(fg_aln, bg_aln, module, nw):
    """Map alignment column positions to "contrast" between two alignments.

    The alignments are cleaned with ``clean_alignments`` first. With ``nw``
    true every sequence gets weight 1; otherwise weights come from
    ``alnutils.sequence_weights``. ``module.compare_cols`` performs the
    per-column test.

    Returns ``(fg_aln, bg_aln, hits)``: the cleaned alignments plus a list
    of (foreground consensus aa, background consensus aa, p-value) tuples,
    one for each column position.
    """
    fg_aln, bg_aln = clean_alignments(fg_aln, bg_aln)
    if nw:
        fg_weights = [1] * len(fg_aln)
        bg_weights = [1] * len(bg_aln)
    else:
        fg_weights = alnutils.sequence_weights(fg_aln, 'none')
        bg_weights = alnutils.sequence_weights(bg_aln, 'none')

    # For the urn module the foreground size is the plain row count
    if module != urn:
        fg_size = sum(fg_weights)
    else:
        fg_size = len(fg_aln)
    bg_size = sum(bg_weights)

    # Overall amino acid frequencies, used for pseudocounts
    aa_freqs = combined_frequencies(fg_aln, fg_weights, bg_aln, bg_weights)
    fg_cons = consensus.consensus(fg_aln,
                                  weights=fg_weights,
                                  trim_ends=False,
                                  gap_threshold=GAP_THRESH)
    bg_cons = consensus.consensus(bg_aln,
                                  weights=bg_weights,
                                  trim_ends=False,
                                  gap_threshold=GAP_THRESH)

    def score(faa, baa, fg_col, bg_col):
        # p-value for one column; indel columns are left untested
        if '-' in (faa, baa):
            return 1.
        return module.compare_cols(fg_col, faa, fg_size, fg_weights,
                                   bg_col, baa, bg_size, bg_weights,
                                   aa_freqs, PSEUDO_SIZE)

    hits = []
    for faa, baa, fg_col, bg_col in zip(fg_cons, bg_cons,
                                        zip(*fg_aln), zip(*bg_aln)):
        hits.append((faa, baa, score(faa, baa, fg_col, bg_col)))
    return fg_aln, bg_aln, hits
def mg_aln2cma(task, level=None):
    """Turn an alignment profile into a CMA (or .tpl) file.

    Reads the Clustal alignment at ``task.depends[0]``, prepends its
    consensus sequence as the first record, and writes the result to
    ``task.target`` through ``biocma``.

    Depends: .aln
    Cleans: .cons.cma, .cons_iron.cma
    """
    name = basename(noext(task.target))

    # Put the consensus back on top of the subfamily-consensus sequence
    # set (.aln) to produce a CMA (.cons.cma)
    aln = AlignIO.read(str(task.depends[0]), 'clustal')
    cons_seq = consensus.consensus(aln, trim_ends=False, gap_threshold=0.6)
    cons_rec = SeqRecord(Seq(cons_seq),
                         id=name,
                         description=name + ' consensus')
    aln._records.insert(0, cons_rec)

    # Tidy up the CMA on the way out
    cmaln = biocma.ChainMultiAlignment(aln, level=level)
    biocma.write([cmaln], task.target, do_iron=True)
def process_one(aln, module, nw):
    """Map the columns of a single alignment to "contrast" p-values.

    With ``nw`` true each sequence has weight 1; otherwise weights come
    from ``alnutils.sequence_weights``. ``module.compare_one`` scores each
    non-gap consensus column.

    Returns ``(aln, hits)`` where ``hits`` has one
    (consensus aa, '_', p-value) tuple per column.
    """
    if nw:
        weights = [1] * len(aln)
    else:
        weights = alnutils.sequence_weights(aln, 'none')

    # The urn module takes the plain row count as the alignment size
    if module != urn:
        aln_size = sum(weights)
    else:
        aln_size = len(aln)
    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(aln, weights=weights, trim_ends=False,
                               gap_threshold=GAP_THRESH)

    def score(cons_aa, col):
        # Indel columns are not tested
        if cons_aa == '-':
            return 1.
        return module.compare_one(col, cons_aa, aln_size, weights,
                                  aa_freqs, PSEUDO_SIZE)

    hits = [(cons_aa, '_', score(cons_aa, col))
            for cons_aa, col in zip(cons, zip(*aln))]
    return aln, hits
def process_one(aln, module, do_weight):
    """Compute per-column "contrast" p-values for a single alignment.

    If ``do_weight`` is true, sequence weights come from
    ``alnutils.sequence_weights``; otherwise every sequence weighs 1.
    Each column with a non-gap consensus is scored by
    ``module.compare_one``.

    Returns ``(aln, hits)``; ``hits`` holds one
    (consensus aa, '_', p-value) tuple per column.
    """
    if not do_weight:
        weights = [1] * len(aln)
    else:
        weights = alnutils.sequence_weights(aln, 'none')

    # For the urn module the size is the row count, not the weight sum
    aln_size = len(aln) if not (module != urn) else fsum(weights)
    aa_freqs = alnutils.aa_frequencies(aln, weights, gap_chars='-.X')
    cons = consensus.consensus(aln,
                               weights=weights,
                               trim_ends=False,
                               gap_threshold=GAP_THRESH)

    hits = []
    for cons_aa, col in zip(cons, zip(*aln)):
        if cons_aa != '-':
            pvalue = module.compare_one(col, cons_aa, aln_size, weights,
                                        aa_freqs, PSEUDO_SIZE)
        else:
            # Indel columns are skipped
            pvalue = 1.
        hits.append((cons_aa, '_', pvalue))
    return aln, hits
def align_profiles(task, use_pdb=None):
    """Align several FASTA files with MAFFT and write Clustal output.

    ``task.depends`` holds the Clustal sub-alignments followed, as its last
    entry, by the PDB alignment. The merged alignment is written to
    ``task.target``.

    Cleans: [depends].cons.seq, [target].families.fa,
        [target].families.seq, [target].seq
    """
    seeds = []
    singles = []

    # The PDB alignment becomes a seed profile when requested and non-empty.
    # NOTE: the message below is also logged when use_pdb is falsy.
    subalignments = task.depends[:-1]
    pdb_seed = str(task.depends[-1])
    if use_pdb and not is_empty(pdb_seed):
        seeds.append(pdb_seed)
    else:
        logging.info("Empty PDB alignment: %s", pdb_seed)

    # Write a consensus sequence/profile file for each subfamily or subgroup
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as cons_file:
            stem = noext(subaln)
            cons_file.write(">%s consensus\n" % basename(stem))
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            if not isdir(stem):
                # Plain family: ungapped consensus sequence only
                cons_file.write(cons_seq.replace('-', '') + "\n")
                continue
            # Group profile: gapped consensus plus every member record
            cons_file.write(cons_seq + "\n")
            for record in aln:
                cons_file.write(">%s\n" % record.id)
                cons_file.write("%s\n" % record.seq)

    # Sort the consensus files into group seeds and single families;
    # the slice presumably strips the '.cons.seq' suffix (9 characters)
    for cons_path in ext(subalignments, 'cons.seq'):
        bucket = seeds if isdir(cons_path[:-9]) else singles
        bucket.append(cons_path)

    # Merge the single-family consensus sequences, then align everything
    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
        'No .fasta files found to build %s' % task.target
    if singles:
        sh("cat %s > %s" % (' '.join(singles), famfa))
    if seeds:
        # Align the families together with the group profiles.
        # (A faster variant would use "--auto" instead of
        # "--maxiterate 1000".)
        seed_flags = ' '.join(['--seed ' + s for s in seeds])
        sh("mafft --quiet --amino --maxiterate 1000 %s %s > %s"
           % (seed_flags, famfa, allseq))
    else:
        # No group profiles, so only the families are aligned
        sh("mafft --quiet --amino --maxiterate 1000 %s > %s"
           % (famfa, allseq))

    # Convert the FASTA result to "pressed" (single-row) Clustal,
    # dropping the PDB-derived sequences on the way
    kept = []
    for rec in SeqIO.parse(allseq, 'fasta'):
        from_pdb = ('TMalign' in rec.description
                    or 'TM-score' in rec.description
                    or rec.id.endswith('.pdb'))
        if not from_pdb:
            kept.append(rec)
    records = list(alnutils.remove_empty_cols(kept))

    if seeds:
        # MAFFT prefixes seed alignment IDs with '_seed_'; strip it again
        for rec in records:
            if rec.id.startswith('_seed_'):
                rec.id = rec.id[6:]

    try:
        max_id_len = max(len(r.id) for r in records)
    except ValueError:
        # max() of an empty sequence: no records survived
        raise ValueError("Profile alignment failed for %s.\nInputs: %s"
                         % (task.target, ' '.join(map(str, task.depends))))

    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        rows = ['%s %s\n' % (rec.id.ljust(max_id_len), rec.seq)
                for rec in records]
        outfile.writelines(rows)
def align_profiles(task, use_pdb=None):
    """Align several FASTA files with MAFFT, producing Clustal output.

    All entries of ``task.depends`` except the last are Clustal
    sub-alignments; the last one is the PDB alignment. The combined
    alignment ends up in ``task.target``.

    Cleans: [depends].cons.seq, [target].families.fa,
        [target].families.seq, [target].seq
    """
    seeds = []
    singles = []

    # Use the PDB alignment as a seed profile if requested and non-empty.
    # NOTE: the message below is also logged when use_pdb is falsy.
    subalignments = task.depends[:-1]
    pdb_seed = str(task.depends[-1])
    if use_pdb and not is_empty(pdb_seed):
        seeds.append(pdb_seed)
    else:
        logging.info("Empty PDB alignment: %s", pdb_seed)

    # Produce the consensus sequence/profile of each subfamily or subgroup
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as cons_file:
            stem = noext(subaln)
            cons_file.write(">%s consensus\n" % basename(stem))
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            if isdir(stem):
                # Group profile: keep gaps and add all member records
                cons_file.write(cons_seq + "\n")
                for member in aln:
                    cons_file.write(">%s\n" % member.id)
                    cons_file.write("%s\n" % member.seq)
            else:
                # Single family: consensus sequence with gaps removed
                cons_file.write(cons_seq.replace('-', '') + "\n")

    # Split the consensus files into groups (seeds) and single families;
    # the slice presumably removes the 9-character '.cons.seq' suffix
    for cons_path in ext(subalignments, 'cons.seq'):
        if not isdir(cons_path[:-9]):
            singles.append(cons_path)
        else:
            seeds.append(cons_path)

    # Concatenate the single-family consensus sequences, then run MAFFT
    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
        'No .fasta files found to build %s' % task.target
    if singles:
        sh("cat %s > %s" % (' '.join(singles), famfa))
    if not seeds:
        # No group profiles, so just align the families
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s > %s"
           % (famfa, allseq))
    else:
        # Align the families with the groups.
        # (A faster variant would use "--auto" instead of the
        # "--globalgenafpair --maxiterate 1000" options.)
        seed_flags = ' '.join(['--seed ' + s for s in seeds])
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s %s > %s"
           % (seed_flags, famfa, allseq))

    # Convert FASTA to "pressed" (single-row) Clustal, leaving out the
    # PDB-derived sequences
    records = [
        rec for rec in SeqIO.parse(allseq, 'fasta')
        if not ('TMalign' in rec.description
                or 'TM-score' in rec.description
                or rec.id.endswith('.pdb'))
    ]
    records = list(alnutils.remove_empty_cols(records))

    if seeds:
        # MAFFT prefixes seed alignment IDs with '_seed_'; remove it
        for rec in records:
            if rec.id.startswith('_seed_'):
                rec.id = rec.id[6:]

    try:
        max_id_len = max(len(r.id) for r in records)
    except ValueError:
        # max() of an empty sequence: nothing was left to write
        raise ValueError("Profile alignment failed for %s.\nInputs: %s"
                         % (task.target, ' '.join(map(str, task.depends))))

    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        rows = ['%s %s\n' % (rec.id.ljust(max_id_len), rec.seq)
                for rec in records]
        outfile.writelines(rows)