def _coerce_alignment_input_type(seq, disallow_alignment):
    """Coerce an aligner input into an ``Alignment``.

    Parameters
    ----------
    seq : string, Sequence, or Alignment
        The object to coerce. A string is wrapped in a ``Sequence`` whose
        ``'id'`` metadata is the empty string. A ``Sequence`` that has no
        ``'id'`` in its metadata is copied with ``seq.copy()`` and the copy
        is given an empty ``'id'``.
    disallow_alignment : bool
        If true, an ``Alignment`` input is rejected rather than returned.

    Returns
    -------
    Alignment
        A single-sequence ``Alignment`` for string and ``Sequence`` input,
        or `seq` itself when it already is an ``Alignment``.

    Raises
    ------
    TypeError
        If `seq` is an ``Alignment`` and `disallow_alignment` is true, or
        if `seq` is none of the supported types.

    """
    if isinstance(seq, string_types):
        return Alignment([Sequence(seq, metadata={'id': ''})])

    if isinstance(seq, Sequence):
        if 'id' not in seq.metadata:
            # Work on a copy so the 'id' is not set on the object passed in.
            seq = seq.copy()
            seq.metadata['id'] = ''
        return Alignment([seq])

    if not isinstance(seq, Alignment):
        raise TypeError("Unsupported type provided to aligner: %r."
                        % type(seq))

    if disallow_alignment:
        # This will disallow aligning either a pair of alignments, or an
        # alignment and a sequence. This isn't currently supported for
        # local alignment as there is not a clear use case, and it's also
        # not exactly clear how this would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")
    return seq
def _fasta_to_alignment(fh, qual=FileSentinel, constructor=Sequence, **kwargs):
    """Collect every record from ``_fasta_to_generator`` into an Alignment.

    All arguments are forwarded unchanged to ``_fasta_to_generator``; the
    records it produces are materialized into a list and wrapped in an
    ``Alignment``.
    """
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor,
                                  **kwargs)
    return Alignment(list(records))
def _clustal_to_alignment(fh, strict=True):
    r"""Read a Clustal multiple sequence alignment into an ``Alignment``.

    Parameters
    ----------
    fh : open file object
        An open Clustal file.
    strict : boolean
        Whether or not to raise a ``ClustalFormatError`` when no labels are
        found. The flag is forwarded to the label-line parser.

    Returns
    -------
    skbio.Alignment
        Alignment object containing the aligned biological sequences, in
        the order given by the parsed labels.

    Raises
    ------
    skbio.util.exception.ClustalFormatError
        If the sequences in `fh` don't have the same sequence length or if
        the sequence ids don't properly match with the subsequences.

    Notes
    -----
    Skips any line that starts with a blank.

    ``_clustal_to_alignment`` preserves the order of the sequences from the
    original file. However, it does use a dict as an intermediate, so two
    sequences can't have the same label. This is probably OK since Clustal
    will refuse to run on a FASTA file in which two sequences have the same
    label, but could potentially cause trouble with manually edited files
    (all the segments of the conflicting sequences would be interleaved,
    possibly in an unpredictable way).

    If the lines have trailing numbers (i.e. Clustal was run with
    `-LINENOS=ON`), silently deletes them. Does not check that the numbers
    actually correspond to the number of chars in the sequence printed so
    far.

    References
    ----------
    .. [1] Thompson JD, Higgins DG, Gibson TJ, "CLUSTAL W: improving the
        sensitivity of progressive multiple sequence alignment through
        sequence weighting, position-specific gap penalties and weight
        matrix choice", Nucleic Acids Res. 1994 Nov 11;22(22):4673-80.

    """
    # Keep only the sequence lines, then drop any trailing line numbers.
    seq_lines = filter(_is_clustal_seq_line, fh)
    records = map(_delete_trailing_number, seq_lines)
    data, labels = _label_line_parser(records, strict)

    if not _check_length(data, labels):
        raise ClustalFormatError("Sequences not aligned properly")

    # Each label maps to a list of segments; join them into one sequence.
    return Alignment([
        Sequence(sequence=''.join(data[label]), metadata={'id': label})
        for label in labels])
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in `seq_path` against the template alignment.

    Parameters
    ----------
    seq_path : str
        Path to the FASTA file holding the candidate sequences.
    result_path : str, optional
        If given, the aligned sequences are written to this path in FASTA
        format and ``None`` is returned.
    log_path : str, optional
        Handed to ``NastLogger``.
    failure_path : str, optional
        If given, the sequences that failed to align are written to this
        path in FASTA format.

    Returns
    -------
    Alignment or None
        The aligned sequences when `result_path` is ``None``; otherwise
        ``None``.

    Notes
    -----
    The template is read from ``self.Params['template_filepath']``; every
    ``'.'`` in a template sequence is replaced with ``'-'`` and the
    sequence is upper-cased before the template ``Alignment`` is built.
    All files opened here are closed before the method returns, including
    when an exception is raised.

    """
    # The candidate file is opened first (as before) and kept open until
    # pynast_seqs has returned: parse_fasta presumably reads lazily, so the
    # handle must outlive the alignment step -- TODO confirm.
    # NOTE(review): the 'U' mode flag is kept unchanged; be aware that it
    # was removed from open() in Python 3.11.
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences, replacing '.' characters with '-'
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # Convert the PyNAST result objects into DNASequence objects.
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is None:
        return pynast_aligned

    with open(result_path, 'w') as result_file:
        result_file.write(pynast_aligned.to_fasta())
    return None
def _fastq_to_alignment(fh, variant=None, phred_offset=None,
                        constructor=BiologicalSequence):
    """Collect every record from ``_fastq_to_generator`` into an Alignment.

    All arguments are forwarded unchanged to ``_fastq_to_generator``; the
    records it produces are materialized into a list and wrapped in an
    ``Alignment``.
    """
    records = _fastq_to_generator(fh, variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor)
    return Alignment(list(records))
def test_call_pynast_test1_alt_min_len(self):
    """PyNastAligner: returns no result when min_len too high
    """
    params = {
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 1000,
    }
    aligner = PyNastAligner(params)

    observed = aligner(self.pynast_test1_input_fp)

    # With min_len set to 1000 the test expects an empty alignment back.
    self.assertEqual(observed, Alignment([]))
def _fastq_to_alignment(fh, variant=None, phred_offset=None,
                        constructor=Sequence, **kwargs):
    """Collect every record from ``_fastq_to_generator`` into an Alignment.

    All arguments, including any extra keyword arguments, are forwarded
    unchanged to ``_fastq_to_generator``; the records it produces are
    materialized into a list and wrapped in an ``Alignment``.
    """
    records = _fastq_to_generator(fh, variant=variant,
                                  phred_offset=phred_offset,
                                  constructor=constructor, **kwargs)
    return Alignment(list(records))
def global_pairwise_align(seq1, seq2, gap_open_penalty, gap_extend_penalty,
                          substitution_matrix, penalize_terminal_gaps=False):
    """Globally align a pair of seqs or alignments with Needleman-Wunsch

    Parameters
    ----------
    seq1 : str, BiologicalSequence, or Alignment
        The first unaligned sequence(s).
    seq2 : str, BiologicalSequence, or Alignment
        The second unaligned sequence(s).
    gap_open_penalty : int or float
        Penalty for opening a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    gap_extend_penalty : int or float
        Penalty for extending a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    substitution_matrix: 2D dict (or similar)
        Lookup for substitution scores (these values are added to the
        previous best alignment score).
    penalize_terminal_gaps: bool, optional
        If True, will continue to penalize gaps even after one sequence has
        been aligned through its end. This behavior is true Needleman-Wunsch
        alignment, but results in (biologically irrelevant) artifacts when
        the sequences being aligned are of different length. This is
        ``False`` by default, which is very likely to be the behavior you
        want in all or nearly all cases.

    Returns
    -------
    skbio.Alignment
        ``Alignment`` object containing the aligned sequences as well as
        details about the alignment.

    See Also
    --------
    local_pairwise_align
    local_pairwise_align_protein
    local_pairwise_align_nucleotide
    skbio.alignment.local_pairwise_align_ssw
    global_pairwise_align_protein
    global_pairwise_align_nucleotide

    Notes
    -----
    This algorithm (in a slightly more basic form) was originally described
    in [1]_. The scikit-bio implementation was validated against the
    EMBOSS needle web server [2]_.

    This function can be used to align either a pair of sequences, a pair of
    alignments, or a sequence and an alignment.

    References
    ----------
    .. [1] A general method applicable to the search for similarities in
       the amino acid sequence of two proteins.
       Needleman SB, Wunsch CD.
       J Mol Biol. 1970 Mar;48(3):443-53.
    .. [2] http://www.ebi.ac.uk/Tools/psa/emboss_needle/

    """
    warn("You're using skbio's python implementation of Needleman-Wunsch "
         "alignment. This is known to be very slow (e.g., thousands of times "
         "slower than a native C implementation). We'll be adding a faster "
         "version soon (see https://github.com/biocore/scikit-bio/issues/254 "
         "to track progress on this).", EfficiencyWarning)

    seq1 = _coerce_alignment_input_type(seq1, disallow_alignment=False)
    seq2 = _coerce_alignment_input_type(seq2, disallow_alignment=False)

    init_matrices_f = (_init_matrices_nw if penalize_terminal_gaps
                       else _init_matrices_nw_no_terminal_gap_penalty)

    matrices = _compute_score_and_traceback_matrices(
        seq1, seq2, gap_open_penalty, gap_extend_penalty,
        substitution_matrix, new_alignment_score=-np.inf,
        init_matrices_f=init_matrices_f,
        penalize_terminal_gaps=penalize_terminal_gaps)
    score_matrix, traceback_matrix = matrices

    # A global alignment is traced back from the last cell of the matrix.
    matrix_shape = traceback_matrix.shape
    last_row = matrix_shape[0] - 1
    last_col = matrix_shape[1] - 1

    traceback_result = _traceback(traceback_matrix, score_matrix, seq1, seq2,
                                  last_row, last_col)
    aligned1, aligned2, score, seq1_start, seq2_start = traceback_result

    # seq1's end position is derived from the column index and seq2's from
    # the row index.
    positions = [(seq1_start, last_col - 1),
                 (seq2_start, last_row - 1)]

    return Alignment(aligned1 + aligned2, score=score,
                     start_end_positions=positions)
def local_pairwise_align(seq1, seq2, gap_open_penalty,
                         gap_extend_penalty, substitution_matrix):
    """Locally align exactly two seqs with Smith-Waterman

    Parameters
    ----------
    seq1 : str or BiologicalSequence
        The first unaligned sequence.
    seq2 : str or BiologicalSequence
        The second unaligned sequence.
    gap_open_penalty : int or float
        Penalty for opening a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    gap_extend_penalty : int or float
        Penalty for extending a gap (this is subtracted from previous best
        alignment score, so is typically positive).
    substitution_matrix: 2D dict (or similar)
        Lookup for substitution scores (these values are added to the
        previous best alignment score).

    Returns
    -------
    skbio.Alignment
        ``Alignment`` object containing the aligned sequences as well as
        details about the alignment.

    See Also
    --------
    local_pairwise_align_protein
    local_pairwise_align_nucleotide
    skbio.alignment.local_pairwise_align_ssw
    global_pairwise_align
    global_pairwise_align_protein
    global_pairwise_align_nucleotide

    Notes
    -----
    This algorithm was originally described in [1]_. The scikit-bio
    implementation was validated against the EMBOSS water web server [2]_.

    References
    ----------
    .. [1] Identification of common molecular subsequences.
       Smith TF, Waterman MS.
       J Mol Biol. 1981 Mar 25;147(1):195-7.
    .. [2] http://www.ebi.ac.uk/Tools/psa/emboss_water/

    """
    warn("You're using skbio's python implementation of Smith-Waterman "
         "alignment. This will be very slow (e.g., thousands of times slower) "
         "than skbio.alignment.local_pairwise_align_ssw.",
         EfficiencyWarning)

    # Alignment inputs are rejected here (disallow_alignment=True).
    seq1 = _coerce_alignment_input_type(seq1, disallow_alignment=True)
    seq2 = _coerce_alignment_input_type(seq2, disallow_alignment=True)

    matrices = _compute_score_and_traceback_matrices(
        seq1, seq2, gap_open_penalty, gap_extend_penalty,
        substitution_matrix, new_alignment_score=0.0,
        init_matrices_f=_init_matrices_sw)
    score_matrix, traceback_matrix = matrices

    # A local alignment is traced back from the highest-scoring cell.
    best_cell = np.argmax(score_matrix)
    end_row, end_col = np.unravel_index(best_cell, score_matrix.shape)

    traceback_result = _traceback(traceback_matrix, score_matrix, seq1, seq2,
                                  end_row, end_col)
    aligned1, aligned2, score, seq1_start, seq2_start = traceback_result

    # seq1's end position is derived from the column index and seq2's from
    # the row index.
    positions = [(seq1_start, end_col - 1),
                 (seq2_start, end_row - 1)]

    return Alignment(aligned1 + aligned2, score=score,
                     start_end_positions=positions)
def local_pairwise_align_ssw(sequence1, sequence2, constructor=Sequence,
                             **kwargs):
    """Align query and target sequences with Striped Smith-Waterman.

    Parameters
    ----------
    sequence1 : str or Sequence
        The first unaligned sequence
    sequence2 : str or Sequence
        The second unaligned sequence
    constructor : Sequence subclass
        A constructor to use if `protein` is not True.

    Returns
    -------
    ``skbio.alignment.Alignment``
        The resulting alignment as an Alignment object

    See Also
    --------
    skbio.alignment.StripedSmithWaterman

    Notes
    -----
    This is a wrapper for the SSW package [1]_.

    For a complete list of optional keyword-arguments that can be provided,
    see ``skbio.alignment.StripedSmithWaterman``.

    The following kwargs will not have any effect: `suppress_sequences` and
    `zero_index`

    If `sequence1` is a ``Protein``, ``protein=True`` is set in the keyword
    arguments and the aligned sequences are built with ``Protein``.

    If an alignment does not meet a provided filter, `None` will be
    returned.

    References
    ----------
    .. [1] Zhao, Mengyao, Wan-Ping Lee, Erik P. Garrison, & Gabor T.
       Marth. "SSW Library: An SIMD Smith-Waterman C/C++ Library for
       Applications". PLOS ONE (2013). Web. 11 July 2014.
       http://www.plosone.org/article/info:doi/10.1371/journal.pone.0082138

    """
    # The sequences are needed for `Alignment` to make sense, so the user
    # is not allowed to suppress them (or to change the index base).
    kwargs['suppress_sequences'] = False
    kwargs['zero_index'] = True

    if isinstance(sequence1, Protein):
        kwargs['protein'] = True

    ssw_query = StripedSmithWaterman(str(sequence1), **kwargs)
    alignment = ssw_query(str(sequence2))

    # If there is no cigar, then it has failed a filter. Return None.
    if not alignment.cigar:
        return None

    if alignment.query_begin == -1:
        start_end = None
    else:
        start_end = [
            (alignment.query_begin, alignment.query_end),
            (alignment.target_begin, alignment.target_end_optimal)
        ]

    # Protein alignments always use Protein; otherwise use the caller's
    # constructor.
    seq_type = Protein if kwargs.get('protein', False) else constructor
    seqs = [
        seq_type(alignment.aligned_query_sequence,
                 metadata={'id': 'query'}),
        seq_type(alignment.aligned_target_sequence,
                 metadata={'id': 'target'})
    ]

    return Alignment(seqs, score=alignment.optimal_alignment_score,
                     start_end_positions=start_end)
def _phylip_to_alignment(fh, constructor=Sequence):
    """Build an ``Alignment`` from the records of ``_parse_phylip_raw(fh)``.

    Each record is unpacked as a ``(sequence, id)`` pair; the sequence is
    passed to `constructor` with the id stored under the ``'id'`` metadata
    key.
    """
    seqs = []
    for seq, seq_id in _parse_phylip_raw(fh):
        seqs.append(constructor(seq, metadata={'id': seq_id}))
    return Alignment(seqs)
def _fasta_to_alignment(fh, qual=FileSentinel,
                        constructor=BiologicalSequence):
    """Collect every record from ``_fasta_to_generator`` into an Alignment.

    All arguments are forwarded unchanged to ``_fasta_to_generator``; the
    records it produces are materialized into a list and wrapped in an
    ``Alignment``.
    """
    records = _fasta_to_generator(fh, qual=qual, constructor=constructor)
    return Alignment(list(records))