def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw or clustalw2 and returns the resulting MSA.

    :arg sequences: the name of a sequence file, an MSA object, or an
        iterable of sequences given as Atomic objects (read with
        :func:`getSequence`), Sequence objects or strings. Strings that
        come without ``labels`` are labelled by their 1-based position
    :type sequences: list, :class:`.MSA`, str

    :arg title: the title for the MSA; it is also used as the prefix for
        the output files (``<title>.fasta`` and ``<title>.aln``)
    :type title: str

    :arg labels: a list of labels to go with the sequences. When empty or
        **None**, labels fetched from the sequences themselves are used
    :type labels: list

    :arg align: whether to do alignment with clustalw(2)
        default True
    :type align: bool

    :returns: the aligned MSA parsed from ``<title>.aln``; when *align* is
        false, the unaligned MSA (built from *sequences*, *sequences*
        itself if it already is an MSA, or parsed from the given file)

    :raises TypeError: if *sequences* is not iterable or holds an entry
        that is not a string, Atomic or Sequence instance
    :raises EnvironmentError: if neither clustalw nor clustalw2 is found
    """
    # Local import: the module is only needed when an alignment is run and
    # the file-level imports are not visible from this block.
    import subprocess

    align = kwargs.get('align', True)

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
        if not align:
            # Nothing to run: hand back the content of the file as is.
            msa = parseMSA(filename)
    else:
        if isinstance(sequences, MSA):
            msa = sequences
        else:
            # Materialize once so that one-shot iterables (generators) are
            # not exhausted before the sequences are extracted.
            try:
                entries = list(sequences)
            except TypeError:
                raise TypeError('sequences should be iterable')

            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(entries):
                if isinstance(sequence, Atomic):
                    strseq = sequence.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # Pad every sequence with gaps up to the longest one so that
            # the rows form a rectangular character array.
            max_len = max([len(strseq) for strseq in strseqs] or [0])
            sequences = array([array(list(strseq.ljust(max_len, '-')))
                               for strseq in strseqs])

            # "if a list" is a pythonic way to check if a list is empty
            # or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            # labels checkers are removed because they will be properly
            # handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        if align:
            # NOTE(review): relies on writeMSA returning the name of the
            # file it wrote, as the original code did - confirm.
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        executable = which('clustalw')
        if executable is None:
            executable = which('clustalw2')
        if executable is None:
            raise EnvironmentError('The executable for clustalw was not '
                                   'found, install clustalw or add it to '
                                   'the path.')

        # Name the output explicitly: by default clustalw derives it from
        # the input file name, which differs from the title whenever the
        # caller passed an existing file. An argument list (no shell) keeps
        # paths containing spaces intact.
        alignment_file = title + '.aln'
        subprocess.call([executable, filename,
                         '-OUTFILE=' + alignment_file])

        # 3. parse and return the new MSA
        msa = parseMSA(alignment_file)

    return msa
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw, clustalw2 or Biopython and returns the
    resulting MSA.

    :arg sequences: the name of a sequence file, an MSA object, or an
        iterable of sequences given as Atomic objects (the sequence of
        their ``ca`` selection is used), Sequence objects, MSA objects
        (only their first sequence is used) or strings. Strings that come
        without ``labels`` are labelled by their 1-based position
    :type sequences: list, :class:`.MSA`, str

    :arg title: the title for the MSA; it is also used as the prefix for
        the output files (``<title>.fasta`` and ``<title>.aln``)
    :type title: str

    :arg labels: a list of labels to go with the sequences. When empty or
        **None**, labels fetched from the sequences themselves are used.
        Spaces in labels are replaced with underscores
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method. A name containing ``'biopython'``
        selects the Biopython pairwise alignment (two sequences only), a
        name containing ``'clustalw'`` selects clustalw or clustalw2, and
        any other name is looked up as an executable that is called with
        clustalw-style options.
        default 'clustalw'
    :type method: str

    :returns: the aligned MSA; when *align* is false, the unaligned MSA
        (built from *sequences*, *sequences* itself if it already is an
        MSA, or parsed from the given file)

    :raises TypeError: if *sequences* is not iterable or holds an entry of
        an unsupported type
    :raises ValueError: if the Biopython method is requested for anything
        but two sequences
    :raises EnvironmentError: if the alignment executable is not found
    """
    # Local import: the module is only needed when an external tool is run
    # and the file-level imports are not visible from this block.
    import subprocess

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    # Everything except the Biopython route runs an external program that
    # reads its input from a file.
    external = 'biopython' not in method

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
        if not align:
            # Nothing to run: hand back the content of the file as is.
            msa = parseMSA(filename)
    else:
        if isinstance(sequences, MSA):
            msa = sequences
        else:
            # Materialize once so that one-shot iterables (generators) are
            # not exhausted before the sequences are extracted.
            try:
                entries = list(sequences)
            except TypeError:
                raise TypeError('sequences should be iterable')

            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(entries):
                if isinstance(sequence, Atomic):
                    strseq = sequence.ca.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, MSA):
                    strseq = str(sequence[0])
                    label = sequence.getLabel(0)
                    LOGGER.warn(
                        'Only the first sequence in the MSA at entry {0} is used.'
                        .format(i))
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # Pad every sequence with gaps up to the longest one so that
            # the rows form a rectangular character array.
            max_len = max([len(strseq) for strseq in strseqs] or [0])
            sequences = array([array(list(strseq.ljust(max_len, '-')))
                               for strseq in strseqs])

            # "if a list" is a pythonic way to check if a list is empty
            # or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            # Spaces are replaced so that a label stays a single token in
            # the sequence file handed to the alignment program.
            if labels:
                labels = [label.replace(' ', '_') for label in labels]

            # labels checkers are removed because they will be properly
            # handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        if align and external:
            # NOTE(review): relies on writeMSA returning the name of the
            # file it wrote, as the original code did - confirm.
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        if not external:
            if len(sequences) == 2:
                msa, _, _ = alignTwoSequencesWithBiopython(
                    sequences[0], sequences[1], **kwargs)
            else:
                raise ValueError('Provide only two sequences or another '
                                 'method. Biopython pairwise alignment can '
                                 'only be used to build an MSA with two '
                                 'sequences.')
        else:
            if 'clustalw' in method:
                executable = which('clustalw')
                if executable is None:
                    executable = which('clustalw2')
                if executable is None:
                    raise EnvironmentError('The executable for clustalw '
                                           'was not found, install clustalw '
                                           'or add it to the path.')
            else:
                executable = which(method)
                if executable is None:
                    raise EnvironmentError('The executable for {0} was not '
                                           'found, install it or add it to '
                                           'the path.'.format(method))

            # Name the output explicitly: by default clustalw derives it
            # from the input file name, which differs from the title
            # whenever the caller passed an existing file. An argument
            # list (no shell) keeps paths containing spaces intact.
            alignment_file = title + '.aln'
            subprocess.call([executable, filename, '-OUTORDER=INPUT',
                             '-OUTFILE=' + alignment_file])

            # 3. parse and return the new MSA
            msa = parseMSA(alignment_file)

    return msa
def buildMSA(sequences, title='Unknown', labels=None, **kwargs):
    """
    Aligns sequences with clustalw, clustalw2 or Biopython and returns the
    resulting MSA.

    :arg sequences: the name of a sequence file, an MSA object, or an
        iterable of sequences given as Atomic objects (the sequence of
        their ``ca`` selection is used), Sequence objects, MSA objects
        (only their first sequence is used) or strings. Strings that come
        without ``labels`` are labelled by their 1-based position
    :type sequences: list, :class:`.MSA`, str

    :arg title: the title for the MSA; it is also used as the prefix for
        the output files (``<title>.fasta`` and ``<title>.aln``)
    :type title: str

    :arg labels: a list of labels to go with the sequences. When empty or
        **None**, labels fetched from the sequences themselves are used.
        Spaces in labels are replaced with underscores
    :type labels: list

    :arg align: whether to align the sequences
        default True
    :type align: bool

    :arg method: alignment method. A name containing ``'biopython'``
        selects the Biopython pairwise alignment (two sequences only), a
        name containing ``'clustalw'`` selects clustalw or clustalw2, and
        any other name is looked up as an executable that is called with
        clustalw-style options.
        default 'clustalw'
    :type method: str

    :returns: the aligned MSA; when *align* is false, the unaligned MSA
        (built from *sequences*, *sequences* itself if it already is an
        MSA, or parsed from the given file)

    :raises TypeError: if *sequences* is not iterable or holds an entry of
        an unsupported type
    :raises ValueError: if the Biopython method is requested for anything
        but two sequences
    :raises EnvironmentError: if the alignment executable is not found
    """
    # Local import: the module is only needed when an external tool is run
    # and the file-level imports are not visible from this block.
    import subprocess

    align = kwargs.get('align', True)
    method = kwargs.pop('method', 'clustalw')
    # Everything except the Biopython route runs an external program that
    # reads its input from a file.
    external = 'biopython' not in method

    # 1. check if sequences are in a fasta file and if not make one
    if isinstance(sequences, str):
        filename = sequences
        if not align:
            # Nothing to run: hand back the content of the file as is.
            msa = parseMSA(filename)
    else:
        if isinstance(sequences, MSA):
            msa = sequences
        else:
            # Materialize once so that one-shot iterables (generators) are
            # not exhausted before the sequences are extracted.
            try:
                entries = list(sequences)
            except TypeError:
                raise TypeError('sequences should be iterable')

            strseqs = []
            fetched_labels = []
            for i, sequence in enumerate(entries):
                if isinstance(sequence, Atomic):
                    strseq = sequence.ca.getSequence()
                    label = sequence.getTitle()
                elif isinstance(sequence, Sequence):
                    strseq = str(sequence)
                    label = sequence.getLabel()
                elif isinstance(sequence, MSA):
                    strseq = str(sequence[0])
                    label = sequence.getLabel(0)
                    LOGGER.warn('Only the first sequence in the MSA at entry {0} is used.'
                                .format(i))
                elif isinstance(sequence, str):
                    strseq = sequence
                    label = str(i + 1)
                else:
                    raise TypeError('sequences should be a list of strings, '
                                    'Atomic, or Sequence instances')
                strseqs.append(strseq)
                fetched_labels.append(label)

            # Pad every sequence with gaps up to the longest one so that
            # the rows form a rectangular character array.
            max_len = max([len(strseq) for strseq in strseqs] or [0])
            sequences = array([array(list(strseq.ljust(max_len, '-')))
                               for strseq in strseqs])

            # "if a list" is a pythonic way to check if a list is empty
            # or not (or none)
            if not labels and fetched_labels:
                labels = fetched_labels

            # Spaces are replaced so that a label stays a single token in
            # the sequence file handed to the alignment program.
            if labels:
                labels = [label.replace(' ', '_') for label in labels]

            # labels checkers are removed because they will be properly
            # handled in MSA class initialization
            msa = MSA(msa=sequences, title=title, labels=labels)

        if align and external:
            # NOTE(review): relies on writeMSA returning the name of the
            # file it wrote, as the original code did - confirm.
            filename = writeMSA(title + '.fasta', msa)

    if align:
        # 2. find and run alignment method
        if not external:
            if len(sequences) == 2:
                msa, _, _ = alignTwoSequencesWithBiopython(sequences[0],
                                                           sequences[1],
                                                           **kwargs)
            else:
                raise ValueError('Provide only two sequences or another '
                                 'method. Biopython pairwise alignment can '
                                 'only be used to build an MSA with two '
                                 'sequences.')
        else:
            if 'clustalw' in method:
                executable = which('clustalw')
                if executable is None:
                    executable = which('clustalw2')
                if executable is None:
                    raise EnvironmentError('The executable for clustalw '
                                           'was not found, install clustalw '
                                           'or add it to the path.')
            else:
                executable = which(method)
                if executable is None:
                    raise EnvironmentError('The executable for {0} was not '
                                           'found, install it or add it to '
                                           'the path.'.format(method))

            # Name the output explicitly: by default clustalw derives it
            # from the input file name, which differs from the title
            # whenever the caller passed an existing file. An argument
            # list (no shell) keeps paths containing spaces intact.
            alignment_file = title + '.aln'
            subprocess.call([executable, filename, '-OUTORDER=INPUT',
                             '-OUTFILE=' + alignment_file])

            # 3. parse and return the new MSA
            msa = parseMSA(alignment_file)

    return msa