Ejemplo n.º 1
0
def alignTwoSequencesWithBiopython(seq1,
                                   seq2,
                                   match=5,
                                   mismatch=-1,
                                   gap_opening=-10,
                                   gap_extension=-1):
    """Globally align two sequences with Biopython's
    ``pairwise2.align.globalms`` and package the result for
    showAlignment.

    Only the first alignment reported by Biopython is used.

    :arg seq1: first sequence to align
    :arg seq2: second sequence to align

    :arg match: score handed to ``globalms`` for a match
    :type match: int

    :arg mismatch: score handed to ``globalms`` for a mismatch
    :type mismatch: int

    :arg gap_opening: score handed to ``globalms`` for opening a gap
    :type gap_opening: int

    :arg gap_extension: score handed to ``globalms`` for extending a gap
    :type gap_extension: int

    Returns a tuple ``(alignment, seq_indices, msa_indices)``. *alignment*
    is a two-row MSA, *seq_indices* holds the running count of non-gap
    characters in the first row and *msa_indices* the same count for the
    second row. Both index arrays begin with an extra ``0``, so they are
    one element longer than the alignment.
    """
    hits = pairwise2.align.globalms(seq1, seq2, match, mismatch,
                                    gap_opening, gap_extension)
    top_row = hits[0][0]
    bottom_row = hits[0][1]

    # Running totals of non-gap characters; the leading 0 is kept on purpose.
    seq_counts = [0]
    msa_counts = [0]
    for column in range(len(top_row)):
        seq_step = 0 if top_row[column] == '-' else 1
        seq_counts.append(seq_counts[-1] + seq_step)

        msa_step = 0 if bottom_row[column] == '-' else 1
        msa_counts.append(msa_counts[-1] + msa_step)

    seq_indices = array(seq_counts)
    msa_indices = array(msa_counts)

    rows = array([array(list(top_row)), array(list(bottom_row))])
    return MSA(msa=rows), seq_indices, msa_indices
Ejemplo n.º 2
0
def alignTwoSequencesWithBiopython(seq1, seq2, **kwargs):
    """Align two sequences with Biopython's ``pairwise2`` (``globalms`` or
    ``localms``) and return an MSA plus indices for use with
    :func:`.showAlignment`.

    Only the first alignment reported by Biopython is used. Scoring
    parameters are optional keyword arguments; when omitted they fall back
    to the module-level constants ``MATCH_SCORE``, ``MISMATCH_SCORE``,
    ``GAP_PENALTY``, ``GAP_EXT_PENALTY`` and ``ALIGNMENT_METHOD``.

    :arg match: a positive integer, used to reward finding a match
    :type match: int

    :arg mismatch: a negative integer, used to penalise finding a mismatch
    :type mismatch: int

    :arg gap_opening: a negative integer, used to penalise opening a gap
    :type gap_opening: int

    :arg gap_extension: a negative integer, used to penalise extending a gap
    :type gap_extension: int

    :arg method: method for pairwise2 alignment.
        Possible values are 'local' and 'global'; anything else raises
        :exc:`ValueError`
    :type method: str

    Returns a tuple ``(alignment, seq_indices, msa_indices)``. The index
    arrays hold the running count of non-gap characters in the first and
    second aligned row respectively and both start with an extra ``0``.
    """
    match = kwargs.get('match', MATCH_SCORE)
    mismatch = kwargs.get('mismatch', MISMATCH_SCORE)
    gap_opening = kwargs.get('gap_opening', GAP_PENALTY)
    gap_extension = kwargs.get('gap_extension', GAP_EXT_PENALTY)
    method = kwargs.get('method', ALIGNMENT_METHOD)

    # Pick the pairwise2 routine first, then call it once.
    if method == 'local':
        run_alignment = pairwise2.align.localms
    elif method == 'global':
        run_alignment = pairwise2.align.globalms
    else:
        raise ValueError('method should be local or global')

    hits = run_alignment(seq1, seq2, match, mismatch, gap_opening, gap_extension)
    top_row = hits[0][0]
    bottom_row = hits[0][1]

    # Running totals of non-gap characters; the leading 0 is kept on purpose.
    seq_counts = [0]
    msa_counts = [0]
    for column in range(len(top_row)):
        seq_step = 0 if top_row[column] == '-' else 1
        seq_counts.append(seq_counts[-1] + seq_step)

        msa_step = 0 if bottom_row[column] == '-' else 1
        msa_counts.append(msa_counts[-1] + msa_step)

    seq_indices = array(seq_counts)
    msa_indices = array(msa_counts)

    rows = array([array(list(top_row)), array(list(bottom_row))])
    return MSA(msa=rows), seq_indices, msa_indices
Ejemplo n.º 3
0
def alignSequenceToMSA(seq, msa, **kwargs):
    """
    Align a sequence from a PDB or Sequence to a sequence from an MSA
    and create two sets of indices.

    The pairwise alignment and the two sets of indices are returned.

    The first set (seq_indices) counts, for every alignment column, how
    many non-gap characters of the query sequence have been seen so far.
    The second set (msa_indices) does the same for the reference sequence
    taken from the msa. Gaps of the reference are turned into ``'.'``
    before aligning, so they are counted as well.

    :arg seq: an object with an associated sequence string
    :type seq: :class:`Atomic`, :class:`Sequence`

    :arg msa: MSA object
    :type msa: :class:`.MSA`

    :arg label: a label for a sequence in msa or a PDB ID
        ``msa.getIndex(label)`` must return a sequence index.
        When omitted it is taken from the title of the atoms (text before
        the first underscore) or from the label of the Sequence.
    :type label: str

    :arg chain: which chain from pdb to use for alignment, default is `'A'`
        **None** uses *seq* without any chain selection.
        This value will be ignored if seq is not an :class:`Atomic` object.
    :type chain: str

    :arg match: a positive integer, used to reward finding a match
        The default is 5, which we found to work in a test case.
    :type match: int

    :arg mismatch: a negative integer, used to penalise finding a mismatch
        The default is -1, which we found to work in a test case
    :type mismatch: int

    :arg gap_opening: a negative integer, used to penalise opening a gap
        The default is -10, which we found to work in a test case
    :type gap_opening: int

    :arg gap_extension: a negative integer, used to penalise extending a gap
        The default is -1, which we found to work in a test case
    :type gap_extension: int

    Raises :exc:`TypeError` for unsupported *seq*, *chain* or *msa* types
    and when the label does not resolve to a single integer index, and
    :exc:`ValueError` when the chain selection is empty or no usable label
    is available.
    """
    label = kwargs.get('label', None)
    chain = kwargs.get('chain', 'A')
    match = kwargs.get('match', 5)
    mismatch = kwargs.get('mismatch', -1)
    gap_opening = kwargs.get('gap_opening', -10)
    gap_extension = kwargs.get('gap_extension', -1)

    if isinstance(seq, Atomic):
        if isinstance(chain, str):
            ag = seq.select('chain {0}'.format(chain))
        elif chain is None:
            ag = seq
        else:
            raise TypeError('chain should be a string or **None**')

        # An empty selection would otherwise surface as an AttributeError
        # on None a few lines further down.
        if ag is None:
            raise ValueError('seq may be None or chain ID may be invalid')

        sequence = ag.select('ca').getSequence()
        query_label = ag.getTitle()
    elif isinstance(seq, Sequence):
        sequence = str(seq)
        ag = None
        # There are no atoms to take a title from, so the first row of the
        # returned alignment is labelled with the Sequence's own label.
        query_label = seq.getLabel()
    else:
        raise TypeError(
            'seq must be an atomic class, sequence class, or str not {0}'.
            format(type(seq)))

    if not isinstance(msa, MSA):
        raise TypeError('msa must be an MSA instance')

    if label is None:
        if ag:
            label = ag.getTitle().split('_')[0]
        elif isinstance(seq, Sequence):
            label = seq.getLabel()
        else:
            raise ValueError(
                'A label cannot be extracted from seq so please provide one.')

    try:
        seqIndex = msa.getIndex(label)
    except Exception:
        raise ValueError('Please provide a label that can be found in msa.')

    if isinstance(seqIndex, int):
        refMsaSeq = str(msa[seqIndex]).upper().replace('-', '.')

    else:
        raise TypeError(
            'The output from querying that label against msa is not a single sequence.'
        )

    alignment = pairwise2.align.globalms(sequence, str(refMsaSeq),
                                         match, mismatch, gap_opening, gap_extension)
    aligned_seq = alignment[0][0]
    aligned_ref = alignment[0][1]

    # Running count of non-gap characters in each aligned row, one entry
    # per alignment column.
    seq_indices = []
    msa_indices = []
    seq_count = 0
    msa_count = 0
    for seq_char, ref_char in zip(aligned_seq, aligned_ref):
        if seq_char != '-':
            seq_count += 1
        if ref_char != '-':
            msa_count += 1
        seq_indices.append(seq_count)
        msa_indices.append(msa_count)

    seq_indices = array(seq_indices)
    msa_indices = array(msa_indices)

    alignment = MSA(msa=array([array(list(aligned_seq)),
                               array(list(aligned_ref))]),
                    labels=[query_label, label])

    return alignment, seq_indices, msa_indices
Ejemplo n.º 4
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
       as Atomic objects with :func:`getSequence` or Sequence objects or strings.
       If strings are used then labels must be provided using ``labels``
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences.
        Spaces in labels are replaced with underscores.
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method, one of either biopython.align.globalms or clustalw(2).
        Any other value is looked up as an executable on the path.
        default 'clustalw'
    :type method: str

    Remaining keyword arguments are forwarded to
    :func:`alignTwoSequencesWithBiopython` when a biopython method is used.

    Raises :exc:`TypeError` when *sequences* is not iterable or holds an
    unsupported entry, :exc:`ValueError` when a biopython method is asked
    to align anything but two sequences, and :exc:`EnvironmentError` when
    the external program cannot be found.
    """

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    # Biopython aligns in memory; every other method runs an external
    # program that reads the sequences from a file.
    needs_file = align and 'biopython' not in method

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
        if not align:
            # Nothing will be aligned, so return the file's content as is.
            msa = parseMSA(filename)
    elif isinstance(sequences, MSA):
        msa = sequences
        if needs_file:
            filename = writeMSA(title + '.fasta', msa)
    else:
        try:
            entries = list(sequences)
        except TypeError:
            raise TypeError('sequences should be iterable')

        strseqs = []
        fetched_labels = []
        for i, sequence in enumerate(entries):
            if isinstance(sequence, Atomic):
                strseq = sequence.ca.getSequence()
                label = sequence.getTitle()
            elif isinstance(sequence, Sequence):
                strseq = str(sequence)
                label = sequence.getLabel()
            elif isinstance(sequence, MSA):
                strseq = str(sequence[0])
                label = sequence.getLabel(0)
                LOGGER.warn(
                    'Only the first sequence in the MSA at entry {0} is used.'
                    .format(i))
            elif isinstance(sequence, str):
                strseq = sequence
                label = str(i + 1)
            else:
                raise TypeError('sequences should be a list of strings, '
                                'Atomic, or Sequence instances')
            strseqs.append(strseq)
            fetched_labels.append(label)

        # Pad every sequence with gaps to the longest one so that the rows
        # form a rectangular character array.
        max_len = max([len(strseq) for strseq in strseqs] + [0])
        sequences = array([
            array(list(strseq + '-' * (max_len - len(strseq))))
            for strseq in strseqs
        ])

        # "if a list" is a pythonic way to check if a list is empty or not (or none)
        if not labels and fetched_labels:
            labels = fetched_labels

        if labels is not None:
            labels = [label.replace(' ', '_') for label in labels]
        # labels checkers are removed because they will be properly handled in MSA class initialization
        msa = MSA(msa=sequences, title=title, labels=labels)

        if needs_file:
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        if 'biopython' in method:
            if len(sequences) == 2:
                msa, _, _ = alignTwoSequencesWithBiopython(
                    sequences[0], sequences[1], **kwargs)
            else:
                raise ValueError(
                    "Provide only two sequences or another method. \
                                  Biopython pairwise alignment can only be used \
                                  to build an MSA with two sequences.")
        elif 'clustalw' in method:
            clustalw = which('clustalw')
            if clustalw is None:
                clustalw = which('clustalw2')
            if clustalw is None:
                raise EnvironmentError(
                    "The executable for clustalw was not found, \
                                            install clustalw or add it to the path."
                )

            os.system('"%s" %s -OUTORDER=INPUT' % (clustalw, filename))

            # 3. parse and return the new MSA
            # NOTE(review): the result is read from title + '.aln'; when
            # sequences is a file name this presumably requires title to
            # match the file's prefix - confirm against the tool's output.
            msa = parseMSA(title + '.aln')

        else:
            alignTool = which(method)
            if alignTool is None:
                raise EnvironmentError("The executable for {0} was not found, \
                                        install it or add it to the path.".
                                       format(method))

            os.system('"%s" %s -OUTORDER=INPUT' % (alignTool, filename))

            # 3. parse and return the new MSA
            msa = parseMSA(title + '.aln')

    return msa
Ejemplo n.º 5
0
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs :func:`buildMSA` for each chain and optionally joins the results.
    Returns either a single :class:`MSA` or a dictionary containing an :class:`MSA` for each chain.

    Chains are matched by their position in each structure, so every
    structure must have the same number of chains. When chain alignments
    are joined, each row of the result is the aligned chains of one
    structure concatenated with *join_char*.

    :arg PDBs: a list of :class:`AtomGroup` objects
    :type PDBs: list

    :arg join_chains: whether to join chain alignments
        default is True, ignored when the structures have a single chain
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    Raises :exc:`TypeError` when *PDBs* is a scalar or contains something
    that is not an :class:`Atomic` instance, and :exc:`ValueError` when it
    is empty or the chain counts differ.
    """

    if isscalar(PDBs):
        raise TypeError('PDBs should be array-like')

    # Materialise the input so that emptiness can be tested without
    # truth-testing a numpy array, which is ambiguous.
    entries = [] if PDBs is None else list(PDBs)
    if not entries:
        raise ValueError('PDBs should not be empty')

    pdbs = []
    chains = []
    for pdb in entries:
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')
        pdbs.append(pdb)

        pdb_chains = list(pdb.getHierView())
        if chains and len(pdb_chains) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')
        chains.append(pdb_chains)

    labels = []
    for pdb, pdb_chains in zip(pdbs, chains):
        chids = ''.join([chain.getChid() for chain in pdb_chains])
        labels.append(pdb.getTitle() + '_' + chids)

    chain_alignments = []
    alignments = {}
    for j, ref_chain in enumerate(chains[0]):
        # Use the chain ID directly instead of parsing it back out of a
        # label, which breaks when the title itself contains underscores.
        chid = ref_chain.getChid()
        same_position = [pdb_chains[j] for pdb_chains in chains]
        msa = buildMSA(same_position, title='chain_' + chid, labels=labels)
        msa = refineMSA(msa, colocc=1e-9)  # remove gap-only cols

        chain_alignments.append(msa)
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')

    if len(chains[0]) == 1:
        join_chains = False

    if join_chains:
        # rows_by_chain[i][k] is the aligned sequence of structure k in the
        # alignment of chain i; zipping transposes it to one tuple of
        # chains per structure.
        rows_by_chain = [[str(sequence) for sequence in chain_alignment]
                         for chain_alignment in chain_alignments]
        joined_msaarr = array([
            array(list(join_char.join(pdb_rows)))
            for pdb_rows in zip(*rows_by_chain)
        ])

        result = MSA(joined_msaarr,
                     title='joined_chains',
                     labels=[label.split('_')[0] for label in labels])

    else:
        result = alignments
        if len(result) == 1:
            result = result[list(result.keys())[0]]

    return result
Ejemplo n.º 6
0
def alignSequenceToMSA(seq, msa, **kwargs):
    """
    Align a sequence from a PDB or Sequence to a sequence from an MSA
    and create two sets of indices.

    The pairwise alignment and the two sets of indices are returned.

    The first set (*seq_indices*) counts, for every alignment column, how
    many non-gap characters of *seq* have been seen so far; when *seq* is
    atomic the count is shifted so that it starts at the first residue
    number of the selection. The second set (*msa_indices*) does the same
    for the reference sequence in the msa, whose gaps are turned into
    ``'.'`` before aligning and are therefore counted as well.

    :arg seq: an object with an associated sequence string
         or a sequence string itself
    :type seq: :class:`.Atomic`, :class:`.Sequence`, str

    :arg msa: a multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg label: a label for a sequence in msa or a PDB ID
        ``msa.getIndex(label)`` must return a sequence index.
        If a 4 or 5 character label is not found in msa, the header of the
        PDB entry named by its first four characters is parsed and the
        database references of its polymers are tried instead.
    :type label: str

    :arg chain: which chain from pdb to use for alignment, default is **None**,
        which does no selection on *seq*. This value will be ignored if seq is
        not an :class:`.Atomic` object.
    :type chain: str

    Parameters for Biopython ``pairwise2`` alignments can be provided as
    keyword arguments. Default values are originally from ``proteins.compare``
    module, but now found in ``utilities.seqtools``.

    :arg match: a positive integer, used to reward finding a match
    :type match: int

    :arg mismatch: a negative integer, used to penalise finding a mismatch
    :type mismatch: int

    :arg gap_opening: a negative integer, used to penalise opening a gap
    :type gap_opening: int

    :arg gap_extension: a negative integer, used to penalise extending a gap
    :type gap_extension: int

    :arg method: method for pairwise2 alignment.
        Possible values are ``"local"`` and ``"global"``
    :type method: str

    The first row of the returned alignment is labelled with the title of
    the atoms, the label of the Sequence, or ``'query'`` for a plain string.
    """
    label = kwargs.get('label', None)
    chain = kwargs.get('chain', None)

    match = kwargs.get('match', MATCH_SCORE)
    mismatch = kwargs.get('mismatch', MISMATCH_SCORE)
    gap_opening = kwargs.get('gap_opening', GAP_PENALTY)
    gap_extension = kwargs.get('gap_extension', GAP_EXT_PENALTY)
    method = kwargs.get('method', ALIGNMENT_METHOD)

    if isinstance(seq, Atomic):
        if isinstance(chain, str):
            ag = seq.select('chain {0}'.format(chain))
        elif chain is None:
            ag = seq

            chids = ag.getChids()
            if len(unique(chids)) > 1:
                LOGGER.warn('%s consists of multiple chains. Please consider selecting one chain'%(seq.getTitle()))
        else:
            raise TypeError('chain should be a string or None')

        if ag is None:
            raise ValueError('seq may be None or chain ID may be invalid')
        sequence = ag.select('ca').getSequence()
        query_label = ag.getTitle()

    elif isinstance(seq, Sequence):
        sequence = str(seq)
        ag = None
        # No atoms to take a title from; use the Sequence's own label for
        # the first row of the returned alignment.
        query_label = seq.getLabel()
    elif isinstance(seq, str):
        sequence = seq
        ag = None
        # A bare string carries no name, so a placeholder labels its row.
        query_label = 'query'
    else:
        raise TypeError('seq must be an atomic class, sequence class, or str not {0}'
                        .format(type(seq)))

    if not isinstance(msa, MSA):
        raise TypeError('msa must be an MSA instance')

    if label is None:
        if ag:
            label = ag.getTitle().split('_')[0]
        elif isinstance(seq, Sequence):
            label = seq.getLabel()
        else:
            raise ValueError('A label cannot be extracted from seq so please provide one.')

    index = msa.getIndex(label)

    if index is None and (len(label) == 4 or len(label) == 5):
        # The label looks like a PDB ID: try the database references listed
        # in the header of that entry.
        from prody import parsePDB
        try:
            _, header = parsePDB(label[:4], header=True)
        except Exception as err:
            raise IOError('failed to parse header for {0} ({1})'
                            .format(label[:4], str(err)))

        for poly in header['polymers']:
            if chain and poly.chid != chain:
                continue
            for dbref in poly.dbrefs:
                index = msa.getIndex(dbref.idcode)
                if index is not None:
                    LOGGER.info('{0} idcode {1} for {2}{3} '
                                'is found in {4}.'.format(
                                dbref.database, dbref.idcode,
                                label[:4], poly.chid, str(msa)))
                    label = dbref.idcode
                    break

                index = msa.getIndex(dbref.accession)
                if index is not None:
                    LOGGER.info('{0} accession {1} for {2}{3} '
                                'is found in {4}.'.format(
                                dbref.database, dbref.accession,
                                label[:4], poly.chid, str(msa)))
                    label = dbref.accession
                    break
            if index is not None:
                # Stop at the first polymer with a hit.
                break
        # NOTE(review): the original reassigned ``chain`` from the parsed
        # structure here (using the last polymer visited), but the value
        # was never read afterwards, so the assignment was dropped.

    if index is None:
        raise ValueError('label is not in msa, or msa is not indexed')
    try:
        len(index)
    except TypeError:
        pass
    else:
        raise ValueError('label {0} maps onto multiple sequences, '
                            'so cannot be used for refinement'.format(label))

    if isinstance(index, int):
        refMsaSeq = str(msa[index]).upper().replace('-','.')
    else:
        raise TypeError('The output from querying that label against msa is not a single sequence.')

    if method == 'local':
        alignment = pairwise2.align.localms(sequence, str(refMsaSeq),
                                            match, mismatch, gap_opening, gap_extension,
                                            one_alignment_only=1)
    elif method == 'global':
        alignment = pairwise2.align.globalms(sequence, str(refMsaSeq),
                                       match, mismatch, gap_opening, gap_extension,
                                       one_alignment_only=1)
    else:
        raise ValueError('method should be local or global')

    aligned_seq = alignment[0][0]
    aligned_ref = alignment[0][1]

    # Running count of non-gap characters in each aligned row, one entry
    # per alignment column.
    seq_indices = []
    msa_indices = []
    seq_count = 0
    msa_count = 0
    for seq_char, ref_char in zip(aligned_seq, aligned_ref):
        if seq_char != '-':
            seq_count += 1
        if ref_char != '-':
            msa_count += 1
        seq_indices.append(seq_count)
        msa_indices.append(msa_count)

    seq_indices = array(seq_indices)
    msa_indices = array(msa_indices)

    if ag:
        # Shift the counts so they start at the first residue number.
        seq_indices += ag.getResnums()[0] - 1

    alignment = MSA(msa=array([array(list(aligned_seq)),
                               array(list(aligned_ref))]),
                    labels=[query_label, label])

    return alignment, seq_indices, msa_indices
Ejemplo n.º 7
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
       as Atomic objects with :func:`getSequence` or Sequence objects or strings.
       If strings are used then labels must be provided using ``labels``
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences
    :type labels: list

    :arg align: whether to do alignment with clustalw(2)
        default True
    :type align: bool

    Raises :exc:`TypeError` when *sequences* is not iterable or holds an
    unsupported entry and :exc:`EnvironmentError` when neither clustalw nor
    clustalw2 is found on the path.
    """

    align = kwargs.get('align', True)
    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
        if not align:
            # Nothing will be aligned, so return the file's content as is.
            msa = parseMSA(filename)
    elif isinstance(sequences, MSA):
        msa = sequences
        if align:
            filename = writeMSA(title + '.fasta', msa)
    else:
        try:
            entries = list(sequences)
        except TypeError:
            raise TypeError('sequences should be iterable')

        strseqs = []
        fetched_labels = []
        for i, sequence in enumerate(entries):
            if isinstance(sequence, Atomic):
                strseq = sequence.getSequence()
                label = sequence.getTitle()
            elif isinstance(sequence, Sequence):
                strseq = str(sequence)
                label = sequence.getLabel()
            elif isinstance(sequence, str):
                strseq = sequence
                label = str(i + 1)
            else:
                raise TypeError('sequences should be a list of strings, '
                                'Atomic, or Sequence instances')
            strseqs.append(strseq)
            fetched_labels.append(label)

        # Pad every sequence with gaps to the longest extracted string so
        # that the rows form a rectangular character array.
        max_len = max([len(strseq) for strseq in strseqs] + [0])
        sequences = array([
            array(list(strseq + '-' * (max_len - len(strseq))))
            for strseq in strseqs
        ])

        # "if a list" is a pythonic way to check if a list is empty or not (or none)
        if not labels and fetched_labels:
            labels = fetched_labels
        # labels checkers are removed because they will be properly handled in MSA class initialization
        msa = MSA(msa=sequences, title=title, labels=labels)

        if align:
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        clustalw = which('clustalw')
        if clustalw is None:
            clustalw = which('clustalw2')
        if clustalw is None:
            raise EnvironmentError(
                "The executable for clustalw was not found, \
                                        install clustalw or add it to the path."
            )

        os.system('"%s" %s' % (clustalw, filename))

        # 3. parse and return the new MSA
        # NOTE(review): the result is read from title + '.aln'; when
        # sequences is a file name this presumably requires title to match
        # the file's prefix - confirm against clustalw's output naming.
        msa = parseMSA(title + '.aln')

    return msa
Ejemplo n.º 8
0
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs buildMSA for each chain and optionally joins the results.
    Returns either a single MSA or a dictionary containing an MSA for each chain.

    Chains are matched by their position in each structure, so every
    structure must have the same number of chains. The alignments of the
    second and later chains are reordered to follow the sequence order of
    the first chain's alignment and written to ``chain_<chid>.aln``.

    :arg PDBs: a list or array of :class:`Atomic` instances such as
        :class:`AtomGroup` objects; any other entry raises :exc:`TypeError`
    :type PDBs: list or :class:`~numpy.ndarray`

    :arg join_chains: whether to join chain alignments
        default is True
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    Raises :exc:`TypeError` when *PDBs* is not a list or array and
    :exc:`ValueError` when it is empty or the chain counts differ.
    """
    if not isinstance(PDBs, (list, ndarray)):
        raise TypeError('PDBs should be a list or array')

    # len() also works for arrays, where comparing against [] does not.
    if len(PDBs) == 0:
        raise ValueError('PDBs should not be an empty list')

    pdbs = []
    chains = []
    for pdb in PDBs:
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')
        pdbs.append(pdb)

        pdb_chains = list(pdb.getHierView())
        if chains and len(pdb_chains) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')
        chains.append(pdb_chains)

    labels = []
    for pdb, pdb_chains in zip(pdbs, chains):
        chids = ''.join([chain.getChid() for chain in pdb_chains])
        labels.append(pdb.getTitle().split('_')[0] + '_' + chids)

    chain_alignments = []
    alignments = {}
    labels_lists = []
    for j, ref_chain in enumerate(chains[0]):
        prefix = 'chain_' + ref_chain.getChid()
        same_position = [pdb_chains[j] for pdb_chains in chains]
        msa = buildMSA(same_position, title=prefix, labels=labels)

        # make all alignments have the sequences in the same order as the 0th
        labels_lists.append([sequence.getLabel() for sequence in msa])

        if j > 0:
            msa_rows = msa.getArray()
            msaarr = array([msa_rows[msa.getIndex(label)]
                            for label in labels_lists[0]])
            msa = MSA(msaarr,
                      title='reordered_msa_1',
                      labels=list(labels_lists[0]))
            writeMSA(prefix + '.aln', msa)

        chain_alignments.append(msa)

        # after reordering, create the alignments dictionary
        alignments[labels_lists[0][0].split('_')[1][j]] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')
    if join_chains:
        # rows_by_chain[i][k] is the aligned sequence of entry k in the
        # alignment of chain i; zipping transposes it to one tuple of
        # chains per entry, whatever the numbers of chains and entries are.
        rows_by_chain = [[str(sequence) for sequence in chain_alignment]
                         for chain_alignment in chain_alignments]
        orig_labels = []
        if chain_alignments:
            orig_labels = [sequence.getLabel()
                           for sequence in chain_alignments[0]]

        joined_msaarr = array([
            array(list(join_char.join(entry_rows)))
            for entry_rows in zip(*rows_by_chain)
        ])

        result = MSA(joined_msaarr, title='joined_chains', labels=orig_labels)
        result = refineMSA(result, colocc=1e-9)  # remove gap-only cols

    else:
        result = alignments

    return result
Ejemplo n.º 9
0
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs :func:`buildMSA` for each chain and optionally joins the results.
    Returns either a single :class:`MSA` or a dictionary containing an :class:`MSA` for each chain.

    Chains are matched by their position in each structure, so every
    structure must have the same number of chains. When chain alignments
    are joined, each row of the result is the aligned chains of one
    structure concatenated with *join_char*.

    :arg PDBs: a list of :class:`AtomGroup` objects
    :type PDBs: list

    :arg join_chains: whether to join chain alignments
        default is True, ignored when the structures have a single chain
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    Raises :exc:`TypeError` when *PDBs* is a scalar or contains something
    that is not an :class:`Atomic` instance, and :exc:`ValueError` when it
    is empty or the chain counts differ.
    """

    if isscalar(PDBs):
        raise TypeError('PDBs should be array-like')

    # Materialise the input so that emptiness can be tested without
    # truth-testing a numpy array, which is ambiguous.
    entries = [] if PDBs is None else list(PDBs)
    if not entries:
        raise ValueError('PDBs should not be empty')

    pdbs = []
    chains = []
    for pdb in entries:
        if not isinstance(pdb, Atomic):
            raise TypeError('each entry in PDBs must be a :class:`Atomic` instance')
        pdbs.append(pdb)

        pdb_chains = list(pdb.getHierView())
        if chains and len(pdb_chains) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')
        chains.append(pdb_chains)

    labels = []
    for pdb, pdb_chains in zip(pdbs, chains):
        chids = ''.join([chain.getChid() for chain in pdb_chains])
        labels.append(pdb.getTitle() + '_' + chids)

    chain_alignments = []
    alignments = {}
    for j, ref_chain in enumerate(chains[0]):
        # Use the chain ID directly instead of parsing it back out of a
        # label, which breaks when the title itself contains underscores.
        chid = ref_chain.getChid()
        same_position = [pdb_chains[j] for pdb_chains in chains]
        msa = buildMSA(same_position, title='chain_' + chid, labels=labels)
        msa = refineMSA(msa, colocc=1e-9) # remove gap-only cols

        chain_alignments.append(msa)
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')

    if len(chains[0]) == 1:
        join_chains = False

    if join_chains:
        # rows_by_chain[i][k] is the aligned sequence of structure k in the
        # alignment of chain i; zipping transposes it to one tuple of
        # chains per structure.
        rows_by_chain = [[str(sequence) for sequence in chain_alignment]
                         for chain_alignment in chain_alignments]
        joined_msaarr = array([
            array(list(join_char.join(pdb_rows)))
            for pdb_rows in zip(*rows_by_chain)
        ])

        result = MSA(joined_msaarr, title='joined_chains',
                     labels=[label.split('_')[0] for label in labels])

    else:
        result = alignments
        if len(result) == 1:
            result = result[list(result.keys())[0]]

    return result