def p_distance(seq1, seq2):
    """Return the Hamming distance computed over non-gap sites only.

    Both inputs are converted with ``str``. A position is kept only when
    neither string holds the gap character ``"-"`` there; the kept
    characters of each input are joined, wrapped in ``Sequence`` objects
    and handed to ``hamming``. A NaN result is reported as ``0.0``;
    any other result is returned unchanged.
    """
    from skbio.sequence import Sequence
    from numpy import isnan

    first = str(seq1)
    second = str(seq2)

    kept_first = []
    kept_second = []
    for pos, site in enumerate(first):
        # Drop the column as soon as either side holds a gap; the second
        # string is only inspected when the first one is not a gap.
        if site == "-" or second[pos] == "-":
            continue
        kept_first.append(site)
        kept_second.append(second[pos])

    distance = hamming(Sequence("".join(kept_first)),
                       Sequence("".join(kept_second)))
    return 0.0 if isnan(distance) else distance
def _coerce_alignment_input_type(seq, disallow_alignment):
    """Wrap a supported aligner input in an ``Alignment``.

    Parameters
    ----------
    seq : string type, Sequence or Alignment
        Input to coerce. A string becomes a ``Sequence`` whose ``'id'``
        metadata is ``''``. A ``Sequence`` without an ``'id'`` metadata
        entry is copied and the copy receives ``'id' = ''``; one that
        already has an ``'id'`` is used as is.
    disallow_alignment : bool
        When true, passing an ``Alignment`` is rejected.

    Returns
    -------
    Alignment
        A single-sequence alignment for string/``Sequence`` input, or
        ``seq`` itself when it already is an ``Alignment``.

    Raises
    ------
    TypeError
        If ``seq`` is an ``Alignment`` while ``disallow_alignment`` is
        true, or if ``seq`` is none of the supported types.
    """
    if isinstance(seq, string_types):
        return Alignment([Sequence(seq, metadata={'id': ''})])

    if isinstance(seq, Sequence):
        if 'id' not in seq.metadata:
            # Work on a copy so the caller's object is not given an id.
            seq = seq.copy()
            seq.metadata['id'] = ''
        return Alignment([seq])

    if not isinstance(seq, Alignment):
        raise TypeError("Unsupported type provided to aligner: %r." %
                        type(seq))

    if disallow_alignment:
        # This will disallow aligning either a pair of alignments, or an
        # alignment and a sequence. We don't currently support this for
        # local alignment as there is not a clear usecase, and it's also
        # not exactly clear how this would work.
        raise TypeError("Aligning alignments is not currently supported "
                        "with the aligner function that you're calling.")
    return seq
def _clustal_to_alignment(fh, strict=True):
    r"""Read a Clustal multiple sequence alignment into an ``Alignment``.

    Parameters
    ----------
    fh : open file object
        An open Clustal file.
    strict : boolean
        Forwarded to ``_label_line_parser``; controls whether a
        ``ClustalFormatError`` is raised when no labels are found.

    Returns
    -------
    skbio.Alignment
        Alignment object containing the aligned biological sequences,
        one ``Sequence`` per label with the label stored as its ``'id'``
        metadata.

    Raises
    ------
    skbio.util.exception.ClustalFormatError
        If ``_check_length`` rejects the parsed data, i.e. the sequences
        in `fh` don't have the same sequence length or the sequence ids
        don't properly match with the subsequences.

    Notes
    -----
    Only lines accepted by ``_is_clustal_seq_line`` are parsed (lines
    that start with a blank are skipped).

    ``_clustal_to_alignment`` preserves the order of the sequences from
    the original file. However, it does use a dict as an intermediate, so
    two sequences can't have the same label. This is probably OK since
    Clustal will refuse to run on a FASTA file in which two sequences have
    the same label, but could potentially cause trouble with manually
    edited files (all the segments of the conflicting sequences would be
    interleaved, possibly in an unpredictable way).

    If the lines have trailing numbers (i.e. Clustal was run with
    `-LINENOS=ON`), silently deletes them. Does not check that the numbers
    actually correspond to the number of chars in the sequence printed so
    far.

    References
    ----------
    .. [1] Thompson JD, Higgins DG, Gibson TJ, "CLUSTAL W: improving the
        sensitivity of progressive multiple sequence alignment through
        sequence weighting, position-specific gap penalties and weight
        matrix choice", Nucleic Acids Res. 1994 Nov 11;22(22):4673-80.

    """
    seq_lines = filter(_is_clustal_seq_line, fh)
    records = map(_delete_trailing_number, seq_lines)
    data, labels = _label_line_parser(records, strict)

    if not _check_length(data, labels):
        raise ClustalFormatError("Sequences not aligned properly")

    # Each label maps to a list of segments that are joined into one
    # full-length sequence.
    return Alignment([
        Sequence(sequence=''.join(data[label]), metadata={'id': label})
        for label in labels
    ])
def distance_to(self, other):
    """Return the distance between this sequence and ``other``.

    The distance is computed with the ``distance`` method of
    ``SkSequence``. Because that comparison needs sequences of equal
    length, the longer of the two is trimmed at its end down to the
    length of the shorter one before comparing.

    The trimming is applied to temporary slices only: neither
    ``self.sequence`` nor ``other.sequence`` is modified by this call.

    Parameters
    ----------
    other : object
        Object exposing a ``sequence`` attribute to compare against.

    Returns
    -------
    The value returned by ``SkSequence.distance`` for the two trimmed
    sequences.
    """
    # Compare only the shared prefix; slicing leaves the stored
    # sequences untouched, so repeated calls see the full data.
    shared_length = min(len(self.sequence), len(other.sequence))
    skseq_self = SkSequence(self.sequence[:shared_length])
    skseq_other = SkSequence(other.sequence[:shared_length])
    return skseq_self.distance(skseq_other)
def align_sequences(seqs):
    """Align ``seqs`` with the external ``clustalo`` program.

    Each item of ``seqs`` is wrapped in a ``Sequence`` and the collection
    is written in FASTA format to ``rational_designs.fa``.
    ``clustalo -i rational_designs.fa`` is then run, its standard output
    is parsed as FASTA, and every parsed record is returned as a string.

    Returns
    -------
    list of str
        ``str()`` of each record read from the ``clustalo`` output.
    """
    import io
    from subprocess import run, PIPE
    from skbio.io import read, write
    from skbio.sequence import Sequence

    fasta = 'rational_designs.fa'
    write((Sequence(raw) for raw in seqs), format='fasta', into=fasta)

    command = ('clustalo', '-i', fasta)
    completed = run(command, stdout=PIPE, encoding='utf8')

    aligned = read(io.StringIO(completed.stdout), format='fasta')
    return list(map(str, aligned))
def _get_position(self, i):
    """Return element ``i`` of every sequence in ``self._seqs`` as one
    sequence, built with ``Sequence.concat(..., how='outer')``.
    """
    column = [member[i] for member in self._seqs]
    position = Sequence.concat(column, how='outer')
    # TODO: when positional metadata exists, do something like below:
    # if self.has_positional_metadata():
    #     position.metadata = dict(self.positional_metadata.iloc[i])
    return position
def align2skbio(align):
    """Build a ``skbio.TabularMSA`` from the items of ``align``.

    ``align`` must provide ``items()``; every ``(key, value)`` pair
    becomes a ``Sequence`` made from the value, with ``str(key)`` stored
    as its ``'id'`` metadata.
    """
    records = []
    for key, residues in align.items():
        records.append(Sequence(residues, metadata={'id': str(key)}))
    return skbio.TabularMSA(records)
def _get_position(self, i):
    """Return element ``i`` of every sequence in ``self._seqs`` as one
    sequence, built with ``Sequence.concat(..., how='outer')``.

    When ``self.has_positional_metadata()`` is true, the result's
    ``metadata`` is set to a dict made from row ``i`` of
    ``self.positional_metadata``.
    """
    column = [member[i] for member in self._seqs]
    position = Sequence.concat(column, how='outer')
    if not self.has_positional_metadata():
        return position
    position.metadata = dict(self.positional_metadata.iloc[i])
    return position
def _traceback(traceback_matrix, score_matrix, aln1, aln2, start_row,
               start_col, gap_character='-'):
    """Walk the traceback matrix from a start cell and rebuild both
    aligned inputs.

    Parameters
    ----------
    traceback_matrix : 2D indexable
        Matrix of step codes; every cell must hold one of the values in
        ``_traceback_encoding`` (``'match'``, ``'vertical-gap'``,
        ``'horizontal-gap'`` or ``'alignment-end'``).
    score_matrix : 2D indexable
        Matrix of scores; only the cell at ``(start_row, start_col)`` is
        read, to report the best score.
    aln1, aln2 : alignment-like
        Inputs providing ``sequence_count()``, iteration over their
        sequences and integer indexing. Positions of ``aln1`` are
        addressed by column index and positions of ``aln2`` by row index
        (both offset by one relative to the matrices).
    start_row, start_col : int
        Cell at which the walk begins.
    gap_character : str, optional
        Character inserted wherever a gap is introduced.

    Returns
    -------
    tuple
        ``(aligned_seqs1, aligned_seqs2, best_score, end_col, end_row)``
        where the first two items are lists of ``Sequence`` objects (one
        per input sequence, ids taken from ``_get_seq_id``), and
        ``end_col`` / ``end_row`` are the column and row at which the
        walk stopped.

    Raises
    ------
    ValueError
        If a cell of ``traceback_matrix`` holds an unknown step code.
    """
    # cache some values for simpler reference
    aend = _traceback_encoding['alignment-end']
    match = _traceback_encoding['match']
    vgap = _traceback_encoding['vertical-gap']
    hgap = _traceback_encoding['horizontal-gap']

    # initialize the result alignments: one character list per input
    # sequence, filled back-to-front during the walk
    aln1_sequence_count = aln1.sequence_count()
    aligned_seqs1 = [[] for e in range(aln1_sequence_count)]
    aln2_sequence_count = aln2.sequence_count()
    aligned_seqs2 = [[] for e in range(aln2_sequence_count)]

    current_row = start_row
    current_col = start_col

    best_score = score_matrix[current_row, current_col]
    current_value = None

    while current_value != aend:
        current_value = traceback_matrix[current_row, current_col]

        if current_value == match:
            # consume one position from both inputs
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
            current_col -= 1
        elif current_value == vgap:
            # gap in aln1, consume one position from aln2
            for aligned_seq in aligned_seqs1:
                aligned_seq.append(gap_character)
            for aligned_seq, input_seq in zip(aligned_seqs2, aln2):
                aligned_seq.append(str(input_seq[current_row - 1]))
            current_row -= 1
        elif current_value == hgap:
            # consume one position from aln1, gap in aln2
            for aligned_seq, input_seq in zip(aligned_seqs1, aln1):
                aligned_seq.append(str(input_seq[current_col - 1]))
            for aligned_seq in aligned_seqs2:
                aligned_seq.append(gap_character)
            current_col -= 1
        elif current_value == aend:
            # the loop condition ends the walk on the next check
            continue
        else:
            raise ValueError("Invalid value in traceback matrix: %s" %
                             current_value)

    # Characters were collected from the end towards the start, so each
    # list is reversed before being turned into a Sequence.
    for i in range(aln1_sequence_count):
        aligned_seq = ''.join(aligned_seqs1[i][::-1])
        seq_id = _get_seq_id(aln1[i], str(i))
        aligned_seqs1[i] = Sequence(aligned_seq, metadata={'id': seq_id})

    for i in range(aln2_sequence_count):
        aligned_seq = ''.join(aligned_seqs2[i][::-1])
        # default ids for aln2 continue numbering after aln1's sequences
        seq_id = _get_seq_id(aln2[i], str(i + aln1_sequence_count))
        aligned_seqs2[i] = Sequence(aligned_seq, metadata={'id': seq_id})

    return (aligned_seqs1, aligned_seqs2,
            best_score, current_col, current_row)