Example #1
0
def combine_alignments(fg_aln, bg_aln):
    """Merge the FG and BG alignments so both share one column numbering.

    Each alignment is written to its own temporary FASTA file and the two
    files are handed to MUSCLE in profile-profile mode (``-profile``).
    The MUSCLE output is parsed as FASTA, passed through
    ``alnutils.remove_empty_cols``, saved to ``_cc_combined.seq`` in the
    current working directory, and returned.
    """
    # NamedTemporaryFile would be tidier, but Windows refuses a second open
    # handle on the same file -- so create each file, close it, and reuse it
    # by name instead.
    tmp_paths = []
    for _ in range(2):
        descriptor, path = tempfile.mkstemp(text=True)
        os.close(descriptor)
        tmp_paths.append(path)
    fg_path, bg_path = tmp_paths

    try:
        AlignIO.write(fg_aln, fg_path, 'fasta')
        AlignIO.write(bg_aln, bg_path, 'fasta')
        muscle_args = ('muscle', '-profile', '-in1', fg_path, '-in2', bg_path)
        output = call_quiet(*muscle_args)
    finally:
        # Remove the temporary inputs whether or not MUSCLE succeeded
        for path in tmp_paths:
            if os.path.exists(path):
                os.remove(path)

    parsed = AlignIO.read(StringIO(output), 'fasta')
    combined = MultipleSeqAlignment(alnutils.remove_empty_cols(parsed),
                                    generic_protein)
    # Keep a copy on disk
    # ENH: choose a reasonable name
    AlignIO.write(combined, '_cc_combined.seq', 'fasta')
    logging.info("Wrote _cc_combined.seq")
    return combined
Example #2
0
def combine_alignments(fg_aln, bg_aln):
    """Align FG and BG to each other so column numbers match.

    Uses MUSCLE for profile-profile alignment: both alignments are written
    to temporary FASTA files, ``muscle -profile`` is run on them, and its
    standard output is parsed back as a FASTA alignment.

    The result is passed through ``alnutils.remove_empty_cols``, saved to
    ``_cc_combined.seq`` in the current working directory, and returned as a
    MultipleSeqAlignment.

    Raises:
        OSError: if the ``muscle`` executable cannot be run.
        subprocess.CalledProcessError: if MUSCLE exits with a non-zero status.
    """
    # This would be simpler with NamedTemporaryFile, but Windows doesn't allow
    # multiple open file handles on the same file, so here we are.
    afd, aseqfname = tempfile.mkstemp(text=True)
    os.close(afd)
    bfd, bseqfname = tempfile.mkstemp(text=True)
    os.close(bfd)
    try:
        AlignIO.write(fg_aln, aseqfname, 'fasta')
        AlignIO.write(bg_aln, bseqfname, 'fasta')
        output = subprocess.check_output([
            'muscle', '-profile',
            '-in1', aseqfname,
            '-in2', bseqfname,
        ])
    finally:
        # Always remove the temporary inputs, even if MUSCLE failed
        if os.path.exists(aseqfname):
            os.remove(aseqfname)
        if os.path.exists(bseqfname):
            os.remove(bseqfname)

    # check_output returns bytes; decode before wrapping in a text buffer,
    # otherwise StringIO rejects it on Python 3.
    full_aln = AlignIO.read(StringIO(output.decode("utf-8")), 'fasta')
    full_aln = MultipleSeqAlignment(alnutils.remove_empty_cols(full_aln),
                                    generic_protein)
    # Save a copy
    # ENH: choose a reasonable name
    AlignIO.write(full_aln, '_cc_combined.seq', 'fasta')
    logging.info("Wrote _cc_combined.seq")
    return full_aln
Example #3
0
def align_profiles(task, use_pdb=None):
    """Align several FASTA files with MAFFT. Clustal output.

    The last entry of ``task.depends`` is treated as the PDB-derived
    alignment; all earlier entries are Clustal sub-alignments. For each
    sub-alignment a ``.cons.seq`` file is written: sub-alignments that have a
    same-named directory ("groups") keep their full profile and are given to
    MAFFT as ``--seed`` alignments, the others contribute only an ungapped
    consensus sequence. The merged result is written to ``task.target`` as a
    single-row Clustal-like alignment.

    Args:
        task: Build task exposing ``depends`` (input paths) and ``target``
            (output path).
        use_pdb: If true and the PDB alignment is non-empty, include it as a
            seed profile.

    Raises:
        AssertionError: if there are neither group profiles nor single
            consensus sequences to align.
        ValueError: if no sequences remain after alignment and filtering.

    Cleans: [depends].cons.seq, [target].families.fa, [target].families.seq,
        [target].seq
    """
    seeds, singles = [], []
    # PDB alignment -- include as a seed profile if requested
    subalignments, pdb_seed = task.depends[:-1], str(task.depends[-1])
    if not use_pdb:
        # Don't claim the alignment is empty when it simply wasn't requested
        logging.info("Not using PDB alignment: %s", pdb_seed)
    elif is_empty(pdb_seed):
        logging.info("Empty PDB alignment: %s", pdb_seed)
    else:
        seeds.append(pdb_seed)

    # Get subfamily and subgroup consensus sequences/profiles
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as outfile:
            outfile.write(">%s consensus\n" % basename(noext(subaln)))
            cons_seq = consensus.consensus(aln,
                                           trim_ends=False,
                                           gap_threshold=0.6)
            if isdir(noext(subaln)):
                # Group profiles: include the subfamily consenses, too
                outfile.write(cons_seq + "\n")
                for record in aln:
                    outfile.write(">%s\n" % record.id)
                    outfile.write("%s\n" % record.seq)
            else:
                # Ungapped family consensus sequences
                outfile.write(cons_seq.replace('-', '') + "\n")

    # Merge the sequences and profiles
    for subconsseq in ext(subalignments, 'cons.seq'):
        # Dropping the last 9 characters removes '.cons.seq'
        # (assumes ext() appended exactly that suffix)
        if isdir(subconsseq[:-9]):
            # Group
            seeds.append(subconsseq)
        else:
            singles.append(subconsseq)
    # First, align/merge the single family consensus sequences
    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
            'No .fasta files found to build %s' % task.target
    if singles:
        sh("cat %s > %s" % (' '.join(singles), famfa))
    else:
        # Only seed profiles are available. MAFFT is still given famfa as its
        # input, so make sure it exists and is blank rather than missing or
        # left over from an earlier run.
        with open(famfa, 'w'):
            pass
    if seeds:
        # Align the families with the groups
        sh("mafft --quiet --amino --maxiterate 1000 %s %s > %s" %
           (' '.join(['--seed ' + s for s in seeds]), famfa, allseq))
    else:
        # No group profiles -- just align the families
        sh("mafft --quiet --amino --maxiterate 1000 %s > %s" % (famfa, allseq))
    # Convert FASTA to "pressed" (single-row) Clustal
    records = [
        rec for rec in SeqIO.parse(allseq, 'fasta')
        # Drop PDB-derived sequences
        if 'TMalign' not in rec.description
        and 'TM-score' not in rec.description and not rec.id.endswith('.pdb')
    ]
    records = list(alnutils.remove_empty_cols(records))
    if seeds:
        # MAFFT prefixes seed alignments with '_seed_' -- get rid of that
        for rec in records:
            if rec.id.startswith('_seed_'):
                rec.id = rec.id[6:]
    if not records:
        # Common failure: MAFFT produced nothing usable
        raise ValueError("Profile alignment failed for %s.\nInputs: %s" %
                         (task.target, ' '.join(map(str, task.depends))))
    max_id_len = max(len(r.id) for r in records)

    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        # One row per sequence, IDs padded to a common width
        outfile.writelines([
            '%s %s\n' % (rec.id.ljust(max_id_len), rec.seq) for rec in records
        ])
Example #4
0
def align_structs(pdb_fnames, seed_fnames=None):
    """Align multiple PDB structures using TM-align.

    Every pair of structures is aligned with the external ``TMalign``
    program. With exactly one pair (two structures), that pairwise alignment
    is returned directly. Otherwise the pairs selected by ``mst_pairs`` are
    written to temporary FASTA files and passed to MAFFT as ``--seed``
    alignments (after any caller-supplied `seed_fnames`) to build a single
    multiple alignment. Temporary files are removed even if MAFFT fails.

    Args:
        pdb_fnames: Sequence of PDB file paths; paths must not contain
            spaces.
        seed_fnames: Optional iterable of extra seed alignment files for
            MAFFT.

    Returns a list of aligned SeqRecords.

    Raises:
        OSError: if ``TMalign`` cannot be executed.
        subprocess.CalledProcessError: if TMalign or MAFFT exits with a
            non-zero status.
    """
    # 1. Align all-v-all structure pairs with TM-align
    allpairs = []
    for idx, ref_pdbfn in enumerate(pdb_fnames):
        assert ' ' not in ref_pdbfn
        for eqv_pdbfn in pdb_fnames[idx + 1:]:
            assert eqv_pdbfn != ref_pdbfn
            logging.info("Aligning %s to %s", eqv_pdbfn, ref_pdbfn)
            try:
                tm_output = subprocess.check_output(
                    ['TMalign', ref_pdbfn, eqv_pdbfn])
            except OSError:
                logging.warning("Failed command: TMalign %s %s", ref_pdbfn,
                                eqv_pdbfn)
                for fname in (ref_pdbfn, eqv_pdbfn):
                    if not os.path.isfile(fname):
                        logging.warning("Missing file: %s", fname)
                raise
            tm_seqpair = read_tmalign_as_seqrec_pair(tm_output, ref_pdbfn,
                                                     eqv_pdbfn)
            allpairs.append(tm_seqpair)

    # In case of 2 structs, no need to combine alignments -- we're done
    if len(allpairs) == 1:
        recs = allpairs[0][:2]
        # Materialize so both return paths yield a list, as documented
        return list(alnutils.remove_empty_cols(recs))

    tmp_fnames = []
    try:
        # 2. Resolve MST pairs & write seed tempfiles
        tmp_seed_fnames = []
        allpairs_dict = {(i[0].id, i[1].id): i for i in allpairs}
        for seedpairid in mst_pairs(allpairs):
            seedpair = allpairs_dict[seedpairid]
            with tempfile.NamedTemporaryFile('w+', delete=False) as handle:
                # Register the file before writing so it is cleaned up even
                # if the write fails part-way
                tmp_fnames.append(handle.name)
                SeqIO.write([seedpair[0], seedpair[1]], handle, 'fasta')
            tmp_seed_fnames.append(handle.name)

        # 3. Use MAFFT to combine TMalign'd pairs into a multiple alignment;
        # mkstemp leaves a blank file behind, which is what MAFFT needs as
        # its (empty) main input alongside the seeds
        seq_fd, seq_fname = tempfile.mkstemp(text=True)
        os.close(seq_fd)
        tmp_fnames.append(seq_fname)

        seed_args = []
        for sfn in list(seed_fnames or []) + tmp_seed_fnames:
            seed_args.extend(['--seed', sfn])
        mafft_output = subprocess.check_output(
            ['mafft', '--quiet', '--amino', '--localpair',
             '--maxiterate', '1000'] + seed_args + [seq_fname])
    finally:
        # Clean up
        for fname in tmp_fnames:
            if os.path.exists(fname):
                os.remove(fname)

    # 4. Emit the aligned sequences
    recs = SeqIO.parse(StringIO(mafft_output.decode("utf-8")), 'fasta')
    recs = clean_and_dedupe_seqs(recs)
    recs = alnutils.remove_empty_cols(recs)
    recs = purge_seqs(recs)
    return list(recs)
Example #5
0
def align_profiles(task, use_pdb=None):
    """Align several FASTA files with MAFFT. Clustal output.

    The last entry of ``task.depends`` is treated as the PDB-derived
    alignment; all earlier entries are Clustal sub-alignments. For each
    sub-alignment a ``.cons.seq`` file is written: sub-alignments that have a
    same-named directory ("groups") keep their full profile and are given to
    MAFFT as ``--seed`` alignments, the others contribute only an ungapped
    consensus sequence. The merged result is written to ``task.target`` as a
    single-row Clustal-like alignment.

    Args:
        task: Build task exposing ``depends`` (input paths) and ``target``
            (output path).
        use_pdb: If true and the PDB alignment is non-empty, include it as a
            seed profile.

    Raises:
        AssertionError: if there are neither group profiles nor single
            consensus sequences to align.
        ValueError: if no sequences remain after alignment and filtering.

    Cleans: [depends].cons.seq, [target].families.fa, [target].families.seq,
        [target].seq
    """
    seeds, singles = [], []
    # PDB alignment -- include as a seed profile if requested
    subalignments, pdb_seed = task.depends[:-1], str(task.depends[-1])
    if not use_pdb:
        # Don't claim the alignment is empty when it simply wasn't requested
        logging.info("Not using PDB alignment: %s", pdb_seed)
    elif is_empty(pdb_seed):
        logging.info("Empty PDB alignment: %s", pdb_seed)
    else:
        seeds.append(pdb_seed)

    # Get subfamily and subgroup consensus sequences/profiles
    for subaln in subalignments:
        aln = AlignIO.read(str(subaln), 'clustal')
        with open(ext(subaln, 'cons.seq'), 'w+') as outfile:
            outfile.write(">%s consensus\n" % basename(noext(subaln)))
            cons_seq = consensus.consensus(aln, trim_ends=False,
                                           gap_threshold=0.6)
            if isdir(noext(subaln)):
                # Group profiles: include the subfamily consenses, too
                outfile.write(cons_seq + "\n")
                for record in aln:
                    outfile.write(">%s\n" % record.id)
                    outfile.write("%s\n" % record.seq)
            else:
                # Ungapped family consensus sequences
                outfile.write(cons_seq.replace('-', '') + "\n")

    # Merge the sequences and profiles
    for subconsseq in ext(subalignments, 'cons.seq'):
        # Dropping the last 9 characters removes '.cons.seq'
        # (assumes ext() appended exactly that suffix)
        if isdir(subconsseq[:-9]):
            # Group
            seeds.append(subconsseq)
        else:
            singles.append(subconsseq)
    # First, align/merge the single family consensus sequences
    famfa = ext(task.target, 'families.fa')
    allseq = ext(task.target, 'seq')
    assert singles or seeds, \
            'No .fasta files found to build %s' % task.target
    if singles:
        sh("cat %s > %s" % (' '.join(singles), famfa))
    else:
        # Only seed profiles are available. MAFFT is still given famfa as its
        # input, so make sure it exists and is blank rather than missing or
        # left over from an earlier run.
        with open(famfa, 'w'):
            pass
    if seeds:
        # Align the families with the groups
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s %s > %s"
           % (' '.join(['--seed '+s for s in seeds]), famfa, allseq))
    else:
        # No group profiles -- just align the families
        sh("mafft --quiet --amino --globalgenafpair --maxiterate 1000 %s > %s"
                % (famfa, allseq))
    # Convert FASTA to "pressed" (single-row) Clustal
    records = [rec for rec in SeqIO.parse(allseq, 'fasta')
            # Drop PDB-derived sequences
            if 'TMalign' not in rec.description and
               'TM-score' not in rec.description and
               not rec.id.endswith('.pdb')
            ]
    records = list(alnutils.remove_empty_cols(records))
    if seeds:
        # MAFFT prefixes seed alignments with '_seed_' -- get rid of that
        for rec in records:
            if rec.id.startswith('_seed_'):
                rec.id = rec.id[6:]
    if not records:
        # Common failure: MAFFT produced nothing usable
        raise ValueError("Profile alignment failed for %s.\nInputs: %s"
                         % (task.target, ' '.join(map(str, task.depends))))
    max_id_len = max(len(r.id) for r in records)

    with open(task.target, 'w+') as outfile:
        outfile.write('CLUSTAL X (-like) multiple sequence alignment\n\n')
        # One row per sequence, IDs padded to a common width
        outfile.writelines(
                ['%s %s\n' % (rec.id.ljust(max_id_len), rec.seq)
                 for rec in records])
Example #6
0
                                ref_pdbfn, eqv_pdbfn)
                for fname in (ref_pdbfn, eqv_pdbfn):
                    if not os.path.isfile(fname):
                        logging.warning("Missing file: %s", fname)
                raise
            except subprocess.CalledProcessError, exc:
                raise RuntimeError("TMalign failed (returned %s):\n%s"
                                   % (exc.returncode, exc.output))
            tm_seqpair = read_tmalign_as_seqrec_pair(tm_output,
                                                     ref_pdbfn, eqv_pdbfn)
            allpairs.append(tm_seqpair)

    # In case of 2 structs, no need to combine alignments -- we're done
    if len(allpairs) == 1:
        recs = allpairs[0][:2]
        return alnutils.remove_empty_cols(recs)

    # 2. Resolve MST pairs & write seed tempfiles
    tmp_seed_fnames = []
    for seedpair in mst_pairs(allpairs):
        # fd, seedfn = tempfile.mkstemp(text=True)
        # SeqIO.write(seedpair, seedfn, 'fasta')
        # SeqIO.write(seedpair, os.fdopen(fd), 'fasta')
        with tempfile.NamedTemporaryFile('w+', delete=False) as handle:
            SeqIO.write(seedpair, handle, 'fasta')
            tmp_seed_fnames.append(handle.name)

    # 3. Use MAFFT to combine TMalign'd pairs into a multiple alignment;
    seq_fd, seq_fname = tempfile.mkstemp(text=True)
    # Create a blank file to appease MAFFT
    os.write(seq_fd, '')