Beispiel #1
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw/clustalw2, Biopython or another external
    tool and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
       as Atomic objects with :func:`getSequence` or Sequence objects or strings.
       Entries that are themselves MSA objects contribute their first sequence
       only. Strings without ``labels`` are labelled by their 1-based position.
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences; spaces in labels
        are replaced by underscores
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method. A name containing ``biopython`` selects
        Biopython pairwise alignment (exactly two sequences), a name
        containing ``clustalw`` selects clustalw or clustalw2, and any other
        name is looked up as an executable on the path.
        default 'clustalw'
    :type method: str

    :raises TypeError: if an entry of *sequences* has an unsupported type or
        *sequences* cannot be processed as an iterable
    :raises ValueError: if Biopython is requested for other than two sequences
    :raises EnvironmentError: if the alignment executable cannot be found
    """
    # Local import so this function does not depend on a module-level import.
    import subprocess

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    use_biopython = 'biopython' in method

    msa = None
    filename = None

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
    else:
        if isinstance(sequences, MSA):
            # An MSA can be used directly; there is nothing to collect.
            msa = sequences
        else:
            try:
                strseqs = []
                fetched_labels = []
                # Single pass, so one-shot iterables (generators) work too.
                for i, sequence in enumerate(sequences):
                    if isinstance(sequence, Atomic):
                        strseq = sequence.ca.getSequence()
                        label = sequence.getTitle()
                    elif isinstance(sequence, Sequence):
                        strseq = str(sequence)
                        label = sequence.getLabel()
                    elif isinstance(sequence, MSA):
                        strseq = str(sequence[0])
                        label = sequence.getLabel(0)
                        LOGGER.warn(
                            'Only the first sequence in the MSA at entry {0} is used.'
                            .format(i))
                    elif isinstance(sequence, str):
                        strseq = sequence
                        label = str(i + 1)
                    else:
                        raise TypeError('sequences should be a list of strings, '
                                        'Atomic, or Sequence instances')
                    strseqs.append(strseq)
                    fetched_labels.append(label)

                # Pad with gaps to the longest extracted string so that the
                # character array is guaranteed to be rectangular.
                max_len = max(len(strseq) for strseq in strseqs) if strseqs else 0
                sequences = array([list(strseq + '-' * (max_len - len(strseq)))
                                   for strseq in strseqs])
            except TypeError:
                # Keep the specific message (unsupported entry / not iterable).
                raise
            except Exception:
                raise TypeError('sequences should be iterable')

            # "if a list" is a pythonic way to check if a list is empty or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            if labels is not None:
                labels = [label.replace(' ', '_') for label in labels]
            # labels checkers are removed because they will be properly handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        # Every external tool reads its input from a file; Biopython does not.
        if align and not use_biopython:
            filename = writeMSA(title + '.fasta', msa)

    if not align:
        if msa is None:
            # A file name was given and no alignment was requested: return
            # the file's content instead of failing on an unbound result.
            msa = parseMSA(filename)
        return msa

    # 2. find and run alignment method
    if use_biopython:
        if len(sequences) != 2:
            raise ValueError(
                "Provide only two sequences or another method. "
                "Biopython pairwise alignment can only be used "
                "to build an MSA with two sequences.")
        msa, _, _ = alignTwoSequencesWithBiopython(
            sequences[0], sequences[1], **kwargs)
        return msa

    if 'clustalw' in method:
        executable = which('clustalw')
        if executable is None:
            executable = which('clustalw2')
        if executable is None:
            raise EnvironmentError(
                "The executable for clustalw was not found, "
                "install clustalw or add it to the path.")
    else:
        executable = which(method)
        if executable is None:
            raise EnvironmentError(
                "The executable for {0} was not found, "
                "install it or add it to the path.".format(method))

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through unchanged.
    subprocess.call([executable, filename, '-OUTORDER=INPUT'])

    # 3. parse and return the new MSA
    # NOTE(review): the result is read from title + '.aln'; when *sequences*
    # is a file name this presumably only matches if the file stem equals
    # *title* -- confirm against the tool's output naming.
    return parseMSA(title + '.aln')
Beispiel #2
0
def alignSequencesByChain(PDBs, **kwargs):
    """
    Runs buildMSA for each chain and optionally joins the results.
    Returns either a single MSA or a dictionary containing an MSA for each chain.

    The j-th chains of all structures are aligned together, so every
    structure must have the same number of chains. Each sequence is labelled
    with the part of the structure title before the first underscore,
    followed by an underscore and the concatenated chain IDs.

    :arg PDBs: a non-empty list or array of :class:`Atomic` instances
    :type PDBs: list or :class:`~numpy.ndarray`

    :arg join_chains: whether to join chain alignments
        default is True
    :type join_chains: bool

    :arg join_char: a character for joining chain alignments
        default is '/' as used by PIR format alignments
    :type join_char: str

    :returns: the joined MSA when *join_chains* is true, otherwise a dict
        mapping the chain ID of the first structure's chain to its MSA

    :raises TypeError: if *PDBs* is not a list or array, or an entry is not
        an :class:`Atomic` instance
    :raises ValueError: if *PDBs* is empty or the structures differ in their
        number of chains
    """
    if not isinstance(PDBs, (list, ndarray)):
        raise TypeError('PDBs should be a list or array')

    # len() works for lists and arrays alike; ``PDBs == []`` is an
    # element-wise comparison for arrays.
    if len(PDBs) == 0:
        raise ValueError('PDBs should not be an empty list')

    chains = []   # chains[i][j] is the j-th chain of the i-th structure
    labels = []
    for pdb in PDBs:
        if not isinstance(pdb, Atomic):
            raise TypeError(
                'each entry in PDBs must be a :class:`Atomic` instance')

        pdb_chains = list(pdb.getHierView())
        if chains and len(pdb_chains) != len(chains[0]):
            raise ValueError('all pdbs should have the same number of chains')
        chains.append(pdb_chains)

        chids = ''.join(chain.getChid() for chain in pdb_chains)
        labels.append(pdb.getTitle().split('_')[0] + '_' + chids)

    chain_alignments = []
    alignments = {}
    reference_labels = None
    for j, first_chain in enumerate(chains[0]):
        chid = first_chain.getChid()
        prefix = 'chain_' + chid
        # Plain list of the j-th chains; avoids numpy trying to unpack the
        # chain objects themselves into extra array dimensions.
        msa = buildMSA([pdb_chains[j] for pdb_chains in chains],
                       title=prefix, labels=labels)

        if j == 0:
            reference_labels = [sequence.getLabel() for sequence in msa]
        else:
            # make all alignments have the sequences in the same order as the 0th
            full_array = msa.getArray()
            msaarr = array([full_array[msa.getIndex(label)]
                            for label in reference_labels])
            msa = MSA(msaarr,
                      title='reordered_msa_1',
                      labels=list(reference_labels))
            writeMSA(prefix + '.aln', msa)

        chain_alignments.append(msa)

        # Key by the chain ID itself rather than by a character position in
        # a parsed label, which fails for multi-character or blank IDs.
        alignments[chid] = msa

    join_chains = kwargs.get('join_chains', True)
    join_char = kwargs.get('join_char', '/')
    if not join_chains:
        return alignments

    # rows_by_chain[i][k] is the aligned string of structure k in chain i
    rows_by_chain = [[str(sequence) for sequence in chain_alignment]
                     for chain_alignment in chain_alignments]
    orig_labels = [sequence.getLabel() for sequence in chain_alignments[0]]

    # Transpose to one tuple per structure and glue its chains together.
    joined_msaarr = array([list(join_char.join(parts))
                           for parts in zip(*rows_by_chain)])

    result = MSA(joined_msaarr, title='joined_chains', labels=orig_labels)
    return refineMSA(result, colocc=1e-9)  # remove gap-only cols
Beispiel #3
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
       as Atomic objects with :func:`getSequence` or Sequence objects or strings.
       Strings without ``labels`` are labelled by their 1-based position.
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences
    :type labels: list

    :arg align: whether to do alignment with clustalw(2)
        default True
    :type align: bool

    :raises TypeError: if an entry of *sequences* has an unsupported type or
        *sequences* cannot be processed as an iterable
    :raises EnvironmentError: if neither clustalw nor clustalw2 is found
    """
    # Local import so this function does not depend on a module-level import.
    import subprocess

    align = kwargs.get('align', True)

    msa = None
    filename = None

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
    else:
        if isinstance(sequences, MSA):
            # An MSA can be used directly; there is nothing to collect.
            msa = sequences
        else:
            try:
                strseqs = []
                fetched_labels = []
                # Single pass, so one-shot iterables (generators) work too.
                for i, sequence in enumerate(sequences):
                    if isinstance(sequence, Atomic):
                        strseq = sequence.getSequence()
                        label = sequence.getTitle()
                    elif isinstance(sequence, Sequence):
                        strseq = str(sequence)
                        label = sequence.getLabel()
                    elif isinstance(sequence, str):
                        strseq = sequence
                        label = str(i + 1)
                    else:
                        raise TypeError('sequences should be a list of strings, '
                                        'Atomic, or Sequence instances')
                    strseqs.append(strseq)
                    fetched_labels.append(label)

                # Pad with gaps to the longest extracted string so that the
                # character array is guaranteed to be rectangular.
                max_len = max(len(strseq) for strseq in strseqs) if strseqs else 0
                sequences = array([list(strseq + '-' * (max_len - len(strseq)))
                                   for strseq in strseqs])
            except TypeError:
                # Keep the specific message (unsupported entry / not iterable).
                raise
            except Exception:
                raise TypeError('sequences should be iterable')

            # "if a list" is a pythonic way to check if a list is empty or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels
            # labels checkers are removed because they will be properly handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        if align:
            filename = writeMSA(title + '.fasta', msa)

    if not align:
        if msa is None:
            # A file name was given and no alignment was requested: return
            # the file's content instead of failing on an unbound result.
            msa = parseMSA(filename)
        return msa

    # 2. find and run alignment method
    clustalw = which('clustalw')
    if clustalw is None:
        clustalw = which('clustalw2')
    if clustalw is None:
        raise EnvironmentError(
            "The executable for clustalw was not found, "
            "install clustalw or add it to the path.")

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through unchanged.
    subprocess.call([clustalw, filename])

    # 3. parse and return the new MSA
    # NOTE(review): the result is read from title + '.aln'; when *sequences*
    # is a file name this presumably only matches if the file stem equals
    # *title* -- confirm against clustalw's output naming.
    return parseMSA(title + '.aln')
Beispiel #4
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw/clustalw2, Biopython or another external
    tool and returns the resulting MSA.

    :arg sequences: a file, MSA object or a list or array containing sequences
       as Atomic objects with :func:`getSequence` or Sequence objects or strings.
       Entries that are themselves MSA objects contribute their first sequence
       only. Strings without ``labels`` are labelled by their 1-based position.
    :type sequences: :class:`Atomic`, :class:`.MSA`,
        :class:`~numpy.ndarray`, str

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences; spaces in labels
        are replaced by underscores
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method. A name containing ``biopython`` selects
        Biopython pairwise alignment (exactly two sequences), a name
        containing ``clustalw`` selects clustalw or clustalw2, and any other
        name is looked up as an executable on the path.
        default 'clustalw'
    :type method: str

    :raises TypeError: if an entry of *sequences* has an unsupported type or
        *sequences* cannot be processed as an iterable
    :raises ValueError: if Biopython is requested for other than two sequences
    :raises EnvironmentError: if the alignment executable cannot be found
    """
    # Local import so this function does not depend on a module-level import.
    import subprocess

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    use_biopython = 'biopython' in method

    msa = None
    filename = None

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
    else:
        if isinstance(sequences, MSA):
            # An MSA can be used directly; there is nothing to collect.
            msa = sequences
        else:
            try:
                strseqs = []
                fetched_labels = []
                # Single pass, so one-shot iterables (generators) work too.
                for i, sequence in enumerate(sequences):
                    if isinstance(sequence, Atomic):
                        strseq = sequence.ca.getSequence()
                        label = sequence.getTitle()
                    elif isinstance(sequence, Sequence):
                        strseq = str(sequence)
                        label = sequence.getLabel()
                    elif isinstance(sequence, MSA):
                        strseq = str(sequence[0])
                        label = sequence.getLabel(0)
                        LOGGER.warn('Only the first sequence in the MSA at entry {0} is used.'
                                    .format(i))
                    elif isinstance(sequence, str):
                        strseq = sequence
                        label = str(i + 1)
                    else:
                        raise TypeError('sequences should be a list of strings, '
                                        'Atomic, or Sequence instances')
                    strseqs.append(strseq)
                    fetched_labels.append(label)

                # Pad with gaps to the longest extracted string so that the
                # character array is guaranteed to be rectangular.
                max_len = max(len(strseq) for strseq in strseqs) if strseqs else 0
                sequences = array([list(strseq + '-' * (max_len - len(strseq)))
                                   for strseq in strseqs])
            except TypeError:
                # Keep the specific message (unsupported entry / not iterable).
                raise
            except Exception:
                raise TypeError('sequences should be iterable')

            # "if a list" is a pythonic way to check if a list is empty or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            if labels is not None:
                labels = [label.replace(' ', '_') for label in labels]
            # labels checkers are removed because they will be properly handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        # Every external tool reads its input from a file; Biopython does not.
        if align and not use_biopython:
            filename = writeMSA(title + '.fasta', msa)

    if not align:
        if msa is None:
            # A file name was given and no alignment was requested: return
            # the file's content instead of failing on an unbound result.
            msa = parseMSA(filename)
        return msa

    # 2. find and run alignment method
    if use_biopython:
        if len(sequences) != 2:
            raise ValueError("Provide only two sequences or another method. "
                             "Biopython pairwise alignment can only be used "
                             "to build an MSA with two sequences.")
        msa, _, _ = alignTwoSequencesWithBiopython(sequences[0], sequences[1], **kwargs)
        return msa

    if 'clustalw' in method:
        executable = which('clustalw')
        if executable is None:
            executable = which('clustalw2')
        if executable is None:
            raise EnvironmentError("The executable for clustalw was not found, "
                                   "install clustalw or add it to the path.")
    else:
        executable = which(method)
        if executable is None:
            raise EnvironmentError("The executable for {0} was not found, "
                                   "install it or add it to the path.".format(method))

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed through unchanged.
    subprocess.call([executable, filename, '-OUTORDER=INPUT'])

    # 3. parse and return the new MSA
    # NOTE(review): the result is read from title + '.aln'; when *sequences*
    # is a file name this presumably only matches if the file stem equals
    # *title* -- confirm against the tool's output naming.
    return parseMSA(title + '.aln')