Exemple #1
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: a fasta file name, an MSA object, or a list or array
        containing sequences as Atomic objects with :func:`getSequence`,
        Sequence objects or strings. Strings are labelled with their
        1-based position unless ``labels`` is provided
    :type sequences: str, :class:`.MSA`, list, :class:`~numpy.ndarray`

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences
    :type labels: list

    :arg align: whether to do alignment with clustalw(2)
        default True
    :type align: bool

    :returns: the aligned MSA, or the unaligned MSA when *align* is **False**

    :raises TypeError: if *sequences* is not iterable or holds an entry that
        is not a string, Atomic or Sequence instance
    :raises EnvironmentError: if neither clustalw nor clustalw2 is on the path
    """
    # Local import so the block does not depend on a module-level import.
    import subprocess

    align = kwargs.get('align', True)

    # 1. check if sequences are in a fasta file and if not make one
    filename = None
    msa = None
    if isinstance(sequences, str):
        filename = sequences
    elif isinstance(sequences, MSA):
        msa = sequences
    else:
        # Only the iterability check is guarded, so that the more specific
        # TypeError raised below is not masked by a generic message.
        try:
            entries = list(sequences)
        except TypeError:
            raise TypeError('sequences should be iterable')

        strseqs = []
        fetched_labels = []
        for i, sequence in enumerate(entries):
            if isinstance(sequence, Atomic):
                strseq = sequence.getSequence()
                label = sequence.getTitle()
            elif isinstance(sequence, Sequence):
                strseq = str(sequence)
                label = sequence.getLabel()
            elif isinstance(sequence, str):
                strseq = sequence
                label = str(i + 1)
            else:
                raise TypeError('sequences should be a list of strings, '
                                'Atomic, or Sequence instances')
            strseqs.append(strseq)
            fetched_labels.append(label)

        # Measure the extracted strings rather than the input objects, whose
        # len() may count something other than residues (e.g. atoms).
        max_len = max([len(strseq) for strseq in strseqs] or [0])

        # Pad every row with gaps so that all rows have the same length.
        rows = [array(list(strseq + '-' * (max_len - len(strseq))))
                for strseq in strseqs]

        # "if a list" is a pythonic way to check if a list is empty or not (or none)
        if not labels and fetched_labels:
            labels = fetched_labels
        # labels checkers are removed because they will be properly handled in MSA class initialization
        msa = MSA(msa=array(rows), title=title, labels=labels)

    if not align:
        if msa is None:
            # A file name was given but no alignment requested: load it as is.
            msa = parseMSA(filename)
        return msa

    # 2. find and run alignment method
    clustalw = which('clustalw')
    if clustalw is None:
        clustalw = which('clustalw2')
    if clustalw is None:
        raise EnvironmentError('The executable for clustalw was not found, '
                               'install clustalw or add it to the path.')

    if filename is None:
        filename = writeMSA(title + '.fasta', msa)

    # An argument list avoids shell quoting problems with paths that
    # contain spaces or shell metacharacters.
    subprocess.call([clustalw, filename])

    # 3. parse and return the new MSA
    # clustalw names its output after the input file (extension replaced by
    # .aln), which equals title + '.aln' for the file written above.
    return parseMSA(os.path.splitext(filename)[0] + '.aln')
Exemple #2
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw(2), Biopython or another external tool
    and returns the resulting MSA.

    :arg sequences: a fasta file name, an MSA object, or a list or array
        containing sequences as Atomic objects with :func:`getSequence`,
        Sequence objects, MSA objects (only their first sequence is used)
        or strings. Strings are labelled with their 1-based position
        unless ``labels`` is provided
    :type sequences: str, :class:`.MSA`, list, :class:`~numpy.ndarray`

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences,
        spaces in labels are replaced by underscores
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method, one of either biopython.align.globalms or clustalw(2).
        Any other value is looked up as an executable on the path.
        default 'clustalw'
    :type method: str

    :returns: the aligned MSA, or the unaligned MSA when *align* is **False**

    :raises TypeError: if *sequences* is not iterable or holds an entry of
        an unsupported type
    :raises ValueError: if the Biopython method is requested for a number of
        sequences other than two
    :raises EnvironmentError: if the alignment executable is not on the path
    """
    # Local import so the block does not depend on a module-level import.
    import subprocess

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')

    # 1. check if sequences are in a fasta file and if not make one
    filename = None
    msa = None
    if isinstance(sequences, str):
        filename = sequences
    elif isinstance(sequences, MSA):
        msa = sequences
    else:
        # Only the iterability check is guarded, so that the more specific
        # TypeError raised below is not masked by a generic message.
        try:
            entries = list(sequences)
        except TypeError:
            raise TypeError('sequences should be iterable')

        strseqs = []
        fetched_labels = []
        for i, sequence in enumerate(entries):
            if isinstance(sequence, Atomic):
                strseq = sequence.ca.getSequence()
                label = sequence.getTitle()
            elif isinstance(sequence, Sequence):
                strseq = str(sequence)
                label = sequence.getLabel()
            elif isinstance(sequence, MSA):
                strseq = str(sequence[0])
                label = sequence.getLabel(0)
                LOGGER.warn(
                    'Only the first sequence in the MSA at entry {0} is used.'
                    .format(i))
            elif isinstance(sequence, str):
                strseq = sequence
                label = str(i + 1)
            else:
                raise TypeError('sequences should be a list of strings, '
                                'Atomic, or Sequence instances')
            strseqs.append(strseq)
            fetched_labels.append(label)

        # Measuring the extracted strings gives the padding width directly
        # and avoids copying the CA selection of every Atomic entry twice.
        max_len = max([len(strseq) for strseq in strseqs] or [0])

        # Pad every row with gaps so that all rows have the same length.
        sequences = array([array(list(strseq + '-' * (max_len - len(strseq))))
                           for strseq in strseqs])

        # "if a list" is a pythonic way to check if a list is empty or not (or none)
        if not labels and fetched_labels:
            labels = fetched_labels

        # The cleaned labels must be stored back in ``labels`` to take
        # effect; the guard keeps ``labels=None`` from raising here.
        if labels:
            labels = [label.replace(' ', '_') for label in labels]
        # labels checkers are removed because they will be properly handled in MSA class initialization
        msa = MSA(msa=sequences, title=title, labels=labels)

    if not align:
        if msa is None:
            # A file name was given but no alignment requested: load it as is.
            msa = parseMSA(filename)
        return msa

    # 2. find and run alignment method
    if 'biopython' in method:
        if len(sequences) != 2:
            raise ValueError('Provide only two sequences or another method. '
                             'Biopython pairwise alignment can only be used '
                             'to build an MSA with two sequences.')
        msa, _, _ = alignTwoSequencesWithBiopython(
            sequences[0], sequences[1], **kwargs)
        return msa

    if 'clustalw' in method:
        alignTool = which('clustalw')
        if alignTool is None:
            alignTool = which('clustalw2')
        if alignTool is None:
            raise EnvironmentError(
                'The executable for clustalw was not found, '
                'install clustalw or add it to the path.')
    else:
        alignTool = which(method)
        if alignTool is None:
            # Report the requested name; alignTool is None at this point.
            raise EnvironmentError(
                'The executable for {0} was not found, '
                'install it or add it to the path.'.format(method))

    # Every external tool needs an input file, not only clustalw.
    if filename is None:
        filename = writeMSA(title + '.fasta', msa)

    # An argument list avoids shell quoting problems with paths that
    # contain spaces or shell metacharacters.
    # NOTE(review): other tools are assumed to accept clustalw-style
    # arguments and to write <input stem>.aln -- confirm per tool.
    subprocess.call([alignTool, filename, '-OUTORDER=INPUT'])

    # 3. parse and return the new MSA
    # clustalw names its output after the input file (extension replaced by
    # .aln), which equals title + '.aln' for the file written above.
    return parseMSA(os.path.splitext(filename)[0] + '.aln')
Exemple #3
0
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw(2), Biopython or another external tool
    and returns the resulting MSA.

    :arg sequences: a fasta file name, an MSA object, or a list or array
        containing sequences as Atomic objects with :func:`getSequence`,
        Sequence objects, MSA objects (only their first sequence is used)
        or strings. Strings are labelled with their 1-based position
        unless ``labels`` is provided
    :type sequences: str, :class:`.MSA`, list, :class:`~numpy.ndarray`

    :arg title: the title for the MSA and it will be used as the prefix for output files.
    :type title: str

    :arg labels: a list of labels to go with the sequences,
        spaces in labels are replaced by underscores
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method, one of either biopython.align.globalms or clustalw(2).
        Any other value is looked up as an executable on the path.
        default 'clustalw'
    :type method: str

    :returns: the aligned MSA, or the unaligned MSA when *align* is **False**

    :raises TypeError: if *sequences* is not iterable or holds an entry of
        an unsupported type
    :raises ValueError: if the Biopython method is requested for a number of
        sequences other than two
    :raises EnvironmentError: if the alignment executable is not on the path
    """
    # Local import so the block does not depend on a module-level import.
    import subprocess

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')

    # 1. check if sequences are in a fasta file and if not make one
    filename = None
    msa = None
    if isinstance(sequences, str):
        filename = sequences
    elif isinstance(sequences, MSA):
        msa = sequences
    else:
        # Only the iterability check is guarded, so that the more specific
        # TypeError raised below is not masked by a generic message.
        try:
            entries = list(sequences)
        except TypeError:
            raise TypeError('sequences should be iterable')

        strseqs = []
        fetched_labels = []
        for i, sequence in enumerate(entries):
            if isinstance(sequence, Atomic):
                strseq = sequence.ca.getSequence()
                label = sequence.getTitle()
            elif isinstance(sequence, Sequence):
                strseq = str(sequence)
                label = sequence.getLabel()
            elif isinstance(sequence, MSA):
                strseq = str(sequence[0])
                label = sequence.getLabel(0)
                LOGGER.warn('Only the first sequence in the MSA at entry {0} is used.'
                            .format(i))
            elif isinstance(sequence, str):
                strseq = sequence
                label = str(i + 1)
            else:
                raise TypeError('sequences should be a list of strings, '
                                'Atomic, or Sequence instances')
            strseqs.append(strseq)
            fetched_labels.append(label)

        # Measuring the extracted strings gives the padding width directly
        # and avoids copying the CA selection of every Atomic entry twice.
        max_len = max([len(strseq) for strseq in strseqs] or [0])

        # Pad every row with gaps so that all rows have the same length.
        sequences = array([array(list(strseq + '-'*(max_len - len(strseq))))
                           for strseq in strseqs])

        # "if a list" is a pythonic way to check if a list is empty or not (or none)
        if not labels and fetched_labels:
            labels = fetched_labels

        # The cleaned labels must be stored back in ``labels`` to take
        # effect; the guard keeps ``labels=None`` from raising here.
        if labels:
            labels = [label.replace(' ', '_') for label in labels]
        # labels checkers are removed because they will be properly handled in MSA class initialization
        msa = MSA(msa=sequences, title=title, labels=labels)

    if not align:
        if msa is None:
            # A file name was given but no alignment requested: load it as is.
            msa = parseMSA(filename)
        return msa

    # 2. find and run alignment method
    if 'biopython' in method:
        if len(sequences) != 2:
            raise ValueError('Provide only two sequences or another method. '
                             'Biopython pairwise alignment can only be used '
                             'to build an MSA with two sequences.')
        msa, _, _ = alignTwoSequencesWithBiopython(sequences[0], sequences[1], **kwargs)
        return msa

    if 'clustalw' in method:
        alignTool = which('clustalw')
        if alignTool is None:
            alignTool = which('clustalw2')
        if alignTool is None:
            raise EnvironmentError('The executable for clustalw was not found, '
                                   'install clustalw or add it to the path.')
    else:
        alignTool = which(method)
        if alignTool is None:
            # Report the requested name; alignTool is None at this point.
            raise EnvironmentError('The executable for {0} was not found, '
                                   'install it or add it to the path.'.format(method))

    # Every external tool needs an input file, not only clustalw.
    if filename is None:
        filename = writeMSA(title + '.fasta', msa)

    # An argument list avoids shell quoting problems with paths that
    # contain spaces or shell metacharacters.
    # NOTE(review): other tools are assumed to accept clustalw-style
    # arguments and to write <input stem>.aln -- confirm per tool.
    subprocess.call([alignTool, filename, '-OUTORDER=INPUT'])

    # 3. parse and return the new MSA
    # clustalw names its output after the input file (extension replaced by
    # .aln), which equals title + '.aln' for the file written above.
    return parseMSA(os.path.splitext(filename)[0] + '.aln')