def combine_alignments(fg_aln, bg_aln):
    """Merge the FG and BG alignments so that their columns line up.

    Both alignments are written out as FASTA, handed to MUSCLE in
    profile-profile mode (via ``call_quiet``), and the result is read back
    as FASTA. The merged alignment is passed through
    ``alnutils.remove_empty_cols``, wrapped in a ``MultipleSeqAlignment``,
    saved to ``_cc_combined.seq`` and returned.
    """
    # NamedTemporaryFile would be tidier, but Windows refuses to open a second
    # handle on the same file -- so create the files, close our descriptors,
    # and pass only the paths around.
    fg_fd, fg_path = tempfile.mkstemp(text=True)
    os.close(fg_fd)
    bg_fd, bg_path = tempfile.mkstemp(text=True)
    os.close(bg_fd)
    try:
        AlignIO.write(fg_aln, fg_path, 'fasta')
        AlignIO.write(bg_aln, bg_path, 'fasta')
        muscle_out = call_quiet('muscle', '-profile',
                                '-in1', fg_path, '-in2', bg_path)
    finally:
        # Remove the scratch files whether or not MUSCLE succeeded.
        for scratch_path in (fg_path, bg_path):
            if os.path.exists(scratch_path):
                os.remove(scratch_path)

    merged = AlignIO.read(StringIO(muscle_out), 'fasta')
    combined = MultipleSeqAlignment(alnutils.remove_empty_cols(merged),
                                    generic_protein)
    # Keep a copy on disk
    # ENH: choose a reasonable name
    AlignIO.write(combined, '_cc_combined.seq', 'fasta')
    logging.info("Wrote _cc_combined.seq")
    return combined
def combine_alignments(fg_aln, bg_aln):
    """Align FG and BG to each other so column numbers match.

    Uses MUSCLE for profile-profile alignment: each input alignment is
    written to a temporary FASTA file, ``muscle -profile`` is run on the two
    files, and its standard output is parsed as a FASTA alignment.

    Args:
        fg_aln: Foreground alignment, written with ``AlignIO.write``.
        bg_aln: Background alignment, written with ``AlignIO.write``.

    Returns:
        A ``MultipleSeqAlignment`` built from the MUSCLE output after
        ``alnutils.remove_empty_cols``. A copy is also written to
        ``_cc_combined.seq``.

    Raises:
        subprocess.CalledProcessError: If MUSCLE exits with a non-zero status.
        OSError: If the ``muscle`` executable cannot be started.
    """
    # This would be simpler with NamedTemporaryFile, but Windows doesn't allow
    # multiple open file handles on the same file, so here we are.
    tmp_fnames = []
    try:
        for aln in (fg_aln, bg_aln):
            fd, fname = tempfile.mkstemp(text=True)
            # Register the path first so it is cleaned up even if a later
            # step (including creating the second file) fails.
            tmp_fnames.append(fname)
            os.close(fd)
            AlignIO.write(aln, fname, 'fasta')
        aseqfname, bseqfname = tmp_fnames
        output = subprocess.check_output([
            'muscle', '-profile', '-in1', aseqfname, '-in2', bseqfname,
        ])
    finally:
        for fname in tmp_fnames:
            if os.path.exists(fname):
                os.remove(fname)

    # check_output returns bytes; decode before wrapping in a text buffer.
    full_aln = AlignIO.read(StringIO(output.decode("utf-8")), 'fasta')
    full_aln = MultipleSeqAlignment(alnutils.remove_empty_cols(full_aln),
                                    generic_protein)
    # Save a copy
    # ENH: choose a reasonable name
    AlignIO.write(full_aln, '_cc_combined.seq', 'fasta')
    logging.info("Wrote _cc_combined.seq")
    return full_aln
def align_profiles(task, use_pdb=None):
    """Combine sub-alignments into one MAFFT alignment, written as Clustal.

    All but the last entry of ``task.depends`` are read as Clustal
    sub-alignments; the last entry is the PDB alignment, used as a MAFFT seed
    when `use_pdb` is truthy and the file is not empty. A ``cons.seq`` file is
    written next to every sub-alignment, MAFFT is run through the shell, and
    the filtered result is written to ``task.target`` with one row per
    sequence.

    Cleans: [depends].cons.seq, [target].families.fa, [target].families.seq,
        [target].seq
    """
    seeds = []
    singles = []

    # PDB alignment -- becomes a seed profile only if requested and non-empty
    subalignments, pdb_seed = task.depends[:-1], str(task.depends[-1])
    if not use_pdb or is_empty(pdb_seed):
        # NOTE(review): this is also logged when use_pdb is simply falsy.
        logging.info("Empty PDB alignment: %s", pdb_seed)
    else:
        seeds.append(pdb_seed)

    # Write a consensus file for each subfamily/subgroup alignment
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as cons_file:
            stem = noext(subaln)
            cons_file.write(">%s consensus\n" % basename(stem))
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            if not isdir(stem):
                # Family: just the consensus, with gaps stripped
                cons_file.write(cons_seq.replace('-', '') + "\n")
                continue
            # Group profile: gapped consensus followed by every member row
            cons_file.write(cons_seq + "\n")
            for member in aln:
                cons_file.write(">%s\n" % member.id)
                cons_file.write("%s\n" % member.seq)

    # Sort the consensus files into group seeds and single families.
    # Dropping the last 9 characters removes the '.cons.seq' suffix.
    for cons_fname in ext(subalignments, 'cons.seq'):
        bucket = seeds if isdir(cons_fname[:-9]) else singles
        bucket.append(cons_fname)

    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
        'No .fasta files found to build %s' % task.target

    # Concatenate the single-family consensus sequences
    # NOTE(review): famfa is not created here when `singles` is empty, yet it
    # is still passed to MAFFT below -- confirm this is intended.
    if singles:
        sh("cat %s > %s" % (' '.join(singles), famfa))

    if not seeds:
        # No group profiles -- just align the families
        sh("mafft --quiet --amino --maxiterate 1000 %s > %s"
           % (famfa, allseq))
    else:
        # Align the families against the group (and PDB) seed profiles
        seed_opts = ' '.join(['--seed ' + s for s in seeds])
        sh("mafft --quiet --amino --maxiterate 1000 %s %s > %s"
           % (seed_opts, famfa, allseq))

    # Convert FASTA to "pressed" (single-row) Clustal,
    # dropping the PDB-derived sequences along the way
    kept = []
    for rec in SeqIO.parse(allseq, 'fasta'):
        if ('TMalign' in rec.description
                or 'TM-score' in rec.description
                or rec.id.endswith('.pdb')):
            continue
        kept.append(rec)
    records = list(alnutils.remove_empty_cols(kept))

    if seeds:
        # MAFFT prefixes seed alignments with '_seed_' -- strip it again
        for rec in records:
            if rec.id.startswith('_seed_'):
                rec.id = rec.id[len('_seed_'):]

    try:
        id_width = max(len(rec.id) for rec in records)
    except ValueError:
        # max() of nothing: no records survived
        raise ValueError("Profile alignment failed for %s.\nInputs: %s"
                         % (task.target, ' '.join(map(str, task.depends))))

    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        for rec in records:
            outfile.write('%s %s\n' % (rec.id.ljust(id_width), rec.seq))
def align_structs(pdb_fnames, seed_fnames=None):
    """Align multiple PDB structures using TM-align.

    Every pair of structures is aligned with the external ``TMalign``
    program. If that yields exactly one pair, the pairwise alignment is
    returned directly. Otherwise the pairs chosen by ``mst_pairs`` are written
    to temporary FASTA files and given to MAFFT as ``--seed`` alignments
    (after any caller-supplied `seed_fnames`) to build one multiple
    alignment. All temporary files are removed even if MAFFT fails.

    Args:
        pdb_fnames: Sequence of PDB file paths; paths must not contain spaces.
        seed_fnames: Optional iterable of additional FASTA seed alignment
            paths passed to MAFFT. Not used in the single-pair shortcut.

    Returns:
        A list of aligned SeqRecords.

    Raises:
        OSError: If ``TMalign`` cannot be executed (re-raised after logging).
        subprocess.CalledProcessError: If ``TMalign`` or ``mafft`` exits with
            a non-zero status.
    """
    # 1. Align all-v-all structure pairs with TM-align
    allpairs = []
    for idx, ref_pdbfn in enumerate(pdb_fnames):
        assert ' ' not in ref_pdbfn
        for eqv_pdbfn in pdb_fnames[idx + 1:]:
            assert eqv_pdbfn != ref_pdbfn
            logging.info("Aligning %s to %s", eqv_pdbfn, ref_pdbfn)
            try:
                tm_output = subprocess.check_output(
                    ['TMalign', ref_pdbfn, eqv_pdbfn])
            except OSError:
                # Help diagnose the failure before propagating it.
                logging.warning("Failed command: TMalign %s %s",
                                ref_pdbfn, eqv_pdbfn)
                for fname in (ref_pdbfn, eqv_pdbfn):
                    if not os.path.isfile(fname):
                        logging.warning("Missing file: %s", fname)
                raise
            tm_seqpair = read_tmalign_as_seqrec_pair(tm_output,
                                                     ref_pdbfn, eqv_pdbfn)
            allpairs.append(tm_seqpair)

    # In case of 2 structs, no need to combine alignments -- we're done
    if len(allpairs) == 1:
        recs = allpairs[0][:2]
        # Materialize so both return paths yield a list, as documented.
        return list(alnutils.remove_empty_cols(recs))

    tmp_seed_fnames = []
    seq_fname = None
    try:
        # 2. Resolve MST pairs & write seed tempfiles
        allpairs_dict = {(pair[0].id, pair[1].id): pair for pair in allpairs}
        for seedpairid in mst_pairs(allpairs):
            seedpair = allpairs_dict[seedpairid]
            with tempfile.NamedTemporaryFile('w+', delete=False) as handle:
                # Register the file before writing so that a failed write
                # still gets cleaned up below.
                tmp_seed_fnames.append(handle.name)
                SeqIO.write([seedpair[0], seedpair[1]], handle, 'fasta')

        # 3. Use MAFFT to combine TMalign'd pairs into a multiple alignment.
        # mkstemp leaves an empty file behind; it serves as the blank
        # sequence input needed to appease MAFFT.
        seq_fd, seq_fname = tempfile.mkstemp(text=True)
        os.close(seq_fd)
        seed_args = list(itertools.chain.from_iterable(
            ('--seed', sfn)
            for sfn in list(seed_fnames or []) + tmp_seed_fnames))
        mafft_output = subprocess.check_output(
            ['mafft', '--quiet', '--amino', '--localpair',
             '--maxiterate', '1000']
            + seed_args
            + [seq_fname])
    finally:
        # Clean up, whether or not the alignment succeeded
        for tmp_fname in [seq_fname] + tmp_seed_fnames:
            if tmp_fname and os.path.exists(tmp_fname):
                os.remove(tmp_fname)

    # 4. Emit the aligned sequences
    recs = SeqIO.parse(StringIO(mafft_output.decode("utf-8")), 'fasta')
    recs = clean_and_dedupe_seqs(recs)
    recs = alnutils.remove_empty_cols(recs)
    recs = purge_seqs(recs)
    return list(recs)
def align_profiles(task, use_pdb=None):
    """Combine sub-alignments into one MAFFT alignment, written as Clustal.

    All but the last entry of ``task.depends`` are read as Clustal
    sub-alignments; the last entry is the PDB alignment, used as a MAFFT seed
    when `use_pdb` is truthy and the file is not empty. A ``cons.seq`` file is
    written next to every sub-alignment, MAFFT (``--globalgenafpair``) is run
    through the shell, and the filtered result is written to ``task.target``
    with one row per sequence.

    Cleans: [depends].cons.seq, [target].families.fa, [target].families.seq,
        [target].seq
    """
    seeds = []
    singles = []

    # PDB alignment -- becomes a seed profile only if requested and non-empty
    subalignments, pdb_seed = task.depends[:-1], str(task.depends[-1])
    if not use_pdb or is_empty(pdb_seed):
        # NOTE(review): this is also logged when use_pdb is simply falsy.
        logging.info("Empty PDB alignment: %s", pdb_seed)
    else:
        seeds.append(pdb_seed)

    # Write a consensus file for each subfamily/subgroup alignment
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as cons_file:
            stem = noext(subaln)
            cons_file.write(">%s consensus\n" % basename(stem))
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            if not isdir(stem):
                # Family: just the consensus, with gaps stripped
                cons_file.write(cons_seq.replace('-', '') + "\n")
                continue
            # Group profile: gapped consensus followed by every member row
            cons_file.write(cons_seq + "\n")
            for member in aln:
                cons_file.write(">%s\n" % member.id)
                cons_file.write("%s\n" % member.seq)

    # Sort the consensus files into group seeds and single families.
    # Dropping the last 9 characters removes the '.cons.seq' suffix.
    for cons_fname in ext(subalignments, 'cons.seq'):
        bucket = seeds if isdir(cons_fname[:-9]) else singles
        bucket.append(cons_fname)

    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
        'No .fasta files found to build %s' % task.target

    # Concatenate the single-family consensus sequences
    # NOTE(review): famfa is not created here when `singles` is empty, yet it
    # is still passed to MAFFT below -- confirm this is intended.
    if singles:
        sh("cat %s > %s" % (' '.join(singles), famfa))

    if not seeds:
        # No group profiles -- just align the families
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s > %s"
           % (famfa, allseq))
    else:
        # Align the families against the group (and PDB) seed profiles
        seed_opts = ' '.join(['--seed ' + s for s in seeds])
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s %s > %s"
           % (seed_opts, famfa, allseq))

    # Convert FASTA to "pressed" (single-row) Clustal,
    # dropping the PDB-derived sequences along the way
    kept = []
    for rec in SeqIO.parse(allseq, 'fasta'):
        if ('TMalign' in rec.description
                or 'TM-score' in rec.description
                or rec.id.endswith('.pdb')):
            continue
        kept.append(rec)
    records = list(alnutils.remove_empty_cols(kept))

    if seeds:
        # MAFFT prefixes seed alignments with '_seed_' -- strip it again
        for rec in records:
            if rec.id.startswith('_seed_'):
                rec.id = rec.id[len('_seed_'):]

    try:
        id_width = max(len(rec.id) for rec in records)
    except ValueError:
        # max() of nothing: no records survived
        raise ValueError("Profile alignment failed for %s.\nInputs: %s"
                         % (task.target, ' '.join(map(str, task.depends))))

    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        for rec in records:
            outfile.write('%s %s\n' % (rec.id.ljust(id_width), rec.seq))
ref_pdbfn, eqv_pdbfn) for fname in (ref_pdbfn, eqv_pdbfn): if not os.path.isfile(fname): logging.warning("Missing file: %s", fname) raise except subprocess.CalledProcessError, exc: raise RuntimeError("TMalign failed (returned %s):\n%s" % (exc.returncode, exc.output)) tm_seqpair = read_tmalign_as_seqrec_pair(tm_output, ref_pdbfn, eqv_pdbfn) allpairs.append(tm_seqpair) # In case of 2 structs, no need to combine alignments -- we're done if len(allpairs) == 1: recs = allpairs[0][:2] return alnutils.remove_empty_cols(recs) # 2. Resolve MST pairs & write seed tempfiles tmp_seed_fnames = [] for seedpair in mst_pairs(allpairs): # fd, seedfn = tempfile.mkstemp(text=True) # SeqIO.write(seedpair, seedfn, 'fasta') # SeqIO.write(seedpair, os.fdopen(fd), 'fasta') with tempfile.NamedTemporaryFile('w+', delete=False) as handle: SeqIO.write(seedpair, handle, 'fasta') tmp_seed_fnames.append(handle.name) # 3. Use MAFFT to combine TMalign'd pairs into a multiple alignment; seq_fd, seq_fname = tempfile.mkstemp(text=True) # Create a blank file to appease MAFFT os.write(seq_fd, '')