def _translate_gapped(seq, *args, **kwds):
    """Translate a nucleotide sequence, keeping whole-codon gaps as '-'.

    The sequence is right-padded with 'N' to a multiple of three. Every
    codon that is exactly '---' becomes a single '-' in the output; the
    stretches between gap runs are handed to ``_translate`` after any
    remaining '-' characters have been replaced by 'N'.

    Parameters
    ----------
    seq : SeqRecord, Seq or str
        Nucleotide sequence to translate.
    *args, **kwds
        Accepted but not used by this function.

    Returns
    -------
    The concatenation of the translated stretches and the gap runs.

    Raises
    ------
    ValueError
        If ``seq`` is not a SeqRecord, Seq or str.
    """
    if isinstance(seq, SeqRecord):
        nucs = str(seq.seq)
    elif isinstance(seq, Seq):
        nucs = str(seq)
    elif isinstance(seq, str):
        nucs = seq
    else:
        msg = "can only translate sequences of type SeqRecord, Seq, or str"
        raise ValueError(msg)

    # pad with 'N' so that the sequence consists of whole codons only
    nucs += 'N' * (-len(nucs) % 3)

    protein = ''
    gap_run = 0      # number of consecutive '---' codons seen so far
    seg_start = 0    # start of the current un-gapped stretch
    for pos in range(0, len(nucs), 3):
        if nucs[pos:pos + 3] == '---':
            if gap_run == 0:
                # a gap run begins: flush the stretch that preceded it
                protein += _translate(nucs[seg_start:pos].replace('-', 'N'))
            gap_run += 1
        elif gap_run:
            # a gap run just ended: emit it and open a new stretch here
            protein += '-' * gap_run
            gap_run = 0
            seg_start = pos

    if gap_run:
        protein += '-' * gap_run
    else:
        protein += _translate(nucs[seg_start:].replace('-', 'N'))
    return protein
def _protein_to_codon(protein_matrix, non_identity_penalty=None):
    """Expand a protein score matrix into a 64x64 codon score matrix.

    Parameters
    ----------
    protein_matrix
        Object exposing ``letters`` (searched with ``.index``) and
        ``tondarray()`` (a 2-D array of scores).
    non_identity_penalty : optional
        When truthy, subtracted from every entry whose row and column
        codon indices differ.

    Returns
    -------
    tuple
        ``(dletters, codon_matrix)`` where ``codon_matrix`` is a
        ``(64, 64)`` float array.
    """
    # presumably the four nucleotide letters; only indices 0..3 are used here
    from BioExt.scorematrices._scorematrix import dletters
    # every entry starts at a large negative score
    codon_matrix = np.ones((64, 64), dtype=float) * -1e4
    pletters = protein_matrix.letters
    # amino-acid index -> list of codon indices translating to it
    mapping = defaultdict(list)
    # amino-acid indices corresponding to the stop symbol '*'
    stops = set()
    for i in range(4):
        for j in range(4):
            for k in range(4):
                cdn = ''.join(dletters[l] for l in (i, j, k))
                aa = _translate(cdn)
                idx = pletters.index(aa)
                if aa == '*':
                    stops.add(idx)
                # codon index is the base-4 number formed by (i, j, k)
                mapping[idx].append(16 * i + 4 * j + k)
    protein_matrix_ = protein_matrix.tondarray()
    M, N = protein_matrix_.shape
    for i in range(M):
        for k in mapping[i]:
            for j in range(N):
                for l in mapping[j]:
                    # penalize transitions to stop codons:
                    # such entries keep the -1e4 baseline
                    if i != j and (i in stops or j in stops):
                        pass
                    else:
                        codon_matrix[k, l] = protein_matrix_[i, j]
                    # NOTE(review): as laid out, this penalty also applies to
                    # the stop-codon transitions skipped above -- confirm
                    if k != l and non_identity_penalty:
                        codon_matrix[k, l] -= non_identity_penalty
    return dletters, codon_matrix
def translate_ambiguous(seq, gap_char=_GAP, trim_gaps=True):
    """Translate a sequence codon by codon, expanding ambiguous nucleotides.

    Each codon yielded by ``enumerate_by_codon`` becomes a set holding every
    amino acid obtainable by substituting its ambiguity codes (looked up in
    ``_NUC_AMBIGS``) in all possible combinations.

    Parameters
    ----------
    seq : SeqRecord, Seq or str
        Nucleotide sequence; it is upper-cased before processing.
    gap_char : str
        Gap character. A codon equal to three gap characters is reported as
        the set ``{'-'}``.
    trim_gaps : bool
        When true, every ``gap_char`` is removed before codons are
        enumerated.

    Returns
    -------
    AmbigList
        One set of amino-acid letters per codon.

    Raises
    ------
    ValueError
        If ``seq`` is not a SeqRecord, Seq or str.
    """
    if isinstance(seq, SeqRecord):
        seqstr = str(seq.seq)
    elif isinstance(seq, Seq):
        seqstr = str(seq)
    elif isinstance(seq, str):
        # plain strings are used as-is (previously left seqstr unassigned)
        seqstr = seq
    else:
        msg = 'can only enumerate codons of a SeqRecord, Seq, or str'
        raise ValueError(msg)

    if trim_gaps:
        seqstr = seqstr.replace(gap_char, '')
    seqstr = seqstr.upper()

    aminos = []
    gap_cdn = 3 * gap_char
    for _, cdn in enumerate_by_codon(seqstr, gap_char):
        # if we're not trimming gaps,
        # convert gap codons into a single gap residue
        if cdn == gap_cdn:
            aminos.append(set('-'))
            continue
        # otherwise, combinatorial fun: each position contributes either its
        # ambiguity expansion or the nucleotide itself
        nucs = [
            _NUC_AMBIGS[nuc] if nuc in _NUC_AMBIGS else nuc
            for nuc in cdn
        ]
        aminos.append({_translate(''.join(p)) for p in product(*nucs)})
    return AmbigList(aminos)
def __init__(self, seq, prior=0):
    """Build per-amino-acid cumulative codon distributions from ``seq``.

    Complete codons of ``seq`` are counted per translated amino acid
    (codons translating to 'X' are skipped, a trailing partial codon is
    ignored). For every amino acid the counts, plus ``prior`` per codon,
    are turned into a list of ``(cumulative_probability, codon)`` pairs
    ordered by ascending count. If an amino acid has a zero total, its
    codons are weighted uniformly.

    Parameters
    ----------
    seq : SeqRecord, Seq or str
        Sequence whose codon usage is tallied.
    prior : number
        Pseudo-count added to every codon.

    Raises
    ------
    ValueError
        If ``seq`` has an unsupported type, or a codon is not present in
        the default table entry of its amino acid.
    """
    if isinstance(seq, SeqRecord):
        seq = str(seq.seq)
    elif isinstance(seq, Seq):
        seq = str(seq)
    elif not isinstance(seq, str):
        raise ValueError('seq must be of type SeqRecord, Seq, or str')

    counts = _default_table(0)

    # walk over complete codons only; a trailing partial codon is dropped
    whole = len(seq) - len(seq) % 3
    for start in range(0, whole, 3):
        codon = seq[start:start + 3].upper()
        amino = _translate(codon)
        # skip unknown codons, they are irrelevant
        if amino == 'X':
            continue
        if codon not in counts[amino]:
            raise ValueError("sequence uses malformed alphabet '%s'" % codon)
        counts[amino][codon] += 1

    # replace each Counter by its cumulative distribution
    for amino, codon_counts in counts.items():
        n_codons = len(codon_counts)
        total = prior * n_codons + sum(codon_counts.values())
        uniform = 1. / n_codons
        cumulative = 0.
        cdf = []
        for codon, seen in sorted(codon_counts.items(), key=itemgetter(1)):
            cumulative += (seen + prior) / total if total else uniform
            cdf.append((cumulative, codon))
        counts[amino] = cdf

    self.__table = counts
def _default_table(prior=0):
    """Return a mapping from amino acid to a Counter of its codons.

    All 64 codons over 'ACGT' are translated with ``_translate`` and
    grouped by the resulting amino acid; each codon is entered with a
    count of ``prior``.

    The entry for the unknown amino acid 'X' is always
    ``Counter({'NNN': 1})``, regardless of ``prior``.
    """
    bases = 'ACGT'
    table = {}
    all_codons = (a + b + c for a in bases for b in bases for c in bases)
    for codon in all_codons:
        table.setdefault(_translate(codon), Counter()).update({codon: prior})
    # default the unknown amino acid to NNN in all cases
    table['X'] = Counter({'NNN': 1})
    return table
def __contains__(self, other):
    """Case-insensitive membership test on the sequence or its translation.

    Returns True when ``other`` occurs in ``str(self.seq)``. Otherwise the
    watson strand (spaces removed) is left-padded with 'n' to a multiple of
    three, 'nnn' is appended, and the three windows starting at offsets
    0, 1 and 2 are translated with ``_translate`` and searched in turn.
    """
    needle = other.lower()
    if needle in str(self.seq).lower():
        return True

    strand = self.seq.watson.replace(" ", "")
    length = len(strand)
    pad = -length % 3                    # n's needed to reach whole codons
    padded = "n" * pad + strand + "nnn"
    window = pad + length
    return any(
        needle in _translate(padded[frame:frame + window]).lower()
        for frame in range(3)
    )
def find_aminoacids(self, other):
    """Return the slice of the record encoding the peptide ``other``.

    The watson strand is right-padded with 'n' and translated in reading
    frames 0, 1 and 2 (in that order); the first frame whose translation
    contains ``other`` (case-insensitively) determines the result. The
    slice is shifted by ``self.seq.ovhg`` when that value is positive.
    Returns None when the peptide is not found in any frame.

    >>> from pydna.dseqrecord import Dseqrecord
    >>> s=Dseqrecord("atgtacgatcgtatgctggttatattttag")
    >>> s.seq.translate()
    Seq('MYDRMLVIF*')
    >>> "RML" in s
    True
    >>> "MMM" in s
    False
    >>> s.seq.rc().translate()
    Seq('LKYNQHTIVH')
    >>> "QHT" in s.rc()
    True
    >>> "QHT" in s
    False
    >>> slc = s.find_aa("RML")
    >>> slc
    slice(9, 18, None)
    >>> s[slc]
    Dseqrecord(-9)
    >>> code = s[slc].seq
    >>> code
    Dseq(-9)
    cgtatgctg
    gcatacgac
    >>> code.translate()
    Seq('RML')
    """
    peptide = str(other).lower()
    # the watson strand must not contain any whitespace
    assert self.seq.watson == "".join(self.seq.watson.split())

    watson = self.seq.watson
    length = len(watson)
    pad = -length % 3                    # n's needed to reach whole codons
    padded = watson + "n" * pad + "nnn"
    window = length + pad

    hit = None
    for frame in range(3):
        try:
            hit = _translate(padded[frame:frame + window]).lower().index(peptide)
        except ValueError:
            # not present in this reading frame, try the next one
            continue
        break

    overhang = self.seq.ovhg if self.seq.ovhg > 0 else 0
    if hit is None:
        return None  # TODO return an empty slice or False...?

    begin = frame + hit * 3 + overhang
    return slice(begin, begin + len(peptide) * 3)
def __call__(
        self,
        ref,
        query,
        open_insertion=None,
        extend_insertion=None,
        open_deletion=None,
        extend_deletion=None,
        miscall_cost=None,
        do_local=None,
        do_affine=None
        ):
    """Align ``query`` to ``ref`` and return the per-position score.

    Both inputs are first passed through ``gapless``. Any scoring argument
    left as ``None`` falls back to the value stored on the instance.

    Parameters
    ----------
    ref, query : SeqRecord, Seq or str
        Sequences to align.
    open_insertion, extend_insertion, open_deletion, extend_deletion,
    miscall_cost, do_local, do_affine : optional
        Per-call overrides forwarded to ``_align``.

    Returns
    -------
    tuple
        ``(score, ref_aligned, query_aligned)``. The aligned sequences have
        the same kind (SeqRecord, Seq or plain string) as the respective
        input. For a non-empty query the score is divided by the query
        length (by a third of it in codon mode). An empty query yields a
        score of ``-inf`` and an all-gap query alignment. If the gapless
        inputs are equal and non-empty, they are returned unchanged with
        the summed diagonal score divided by ``len(ref)``.

    Raises
    ------
    ValueError
        In codon mode, when the length of ``ref`` is not a multiple of 3.
    """
    # populate defaults from initialization
    if open_insertion is None:
        open_insertion = self.__open_insertion
    if extend_insertion is None:
        extend_insertion = self.__extend_insertion
    if open_deletion is None:
        open_deletion = self.__open_deletion
    if extend_deletion is None:
        extend_deletion = self.__extend_deletion
    if miscall_cost is None:
        miscall_cost = self.__miscall_cost
    if do_local is None:
        do_local = self.__do_local
    if do_affine is None:
        do_affine = self.__do_affine

    ref = gapless(ref)
    query = gapless(query)

    # if the reference and query are the same, we can return early
    if len(ref) and ref == query:
        if self.__do_codon:
            score = sum(self.__score_matrix[char, char] for char in _translate(ref))
        else:
            score = sum(self.__score_matrix[char, char] for char in ref)
        return score / len(ref), ref, query

    if isinstance(ref, SeqRecord):
        ref_ = str(ref.seq)
    elif isinstance(ref, Seq):
        ref_ = str(ref)
    else:
        ref_ = ref

    if isinstance(query, SeqRecord):
        query_ = str(query.seq)
    elif isinstance(query, Seq):
        query_ = str(query)
    else:
        query_ = query

    # convert to uppercase, because _align assumes it
    ref_ = ref_.upper()
    query_ = query_.upper()

    if self.__do_codon and len(ref_) % 3 != 0:
        raise ValueError('when do_codon = True, len(ref) must be a multiple of 3')

    # NOTE: padding the query with 'N' up to a multiple of 3 in codon mode
    # used to happen here; that step is disabled.

    # for shared memory safety, recreate matrices if the PID changed
    current_pid = getpid()
    if self.__cached_pid != current_pid:
        self.__cached_pid = current_pid
        self.__cached_score_matrix = np.empty((1,), dtype=float)
        self.__cached_deletion_matrix = np.empty((1,), dtype=float)
        self.__cached_insertion_matrix = np.empty((1,), dtype=float)

    # grow the flat work buffers when this problem is larger than any before
    if self.__do_codon:
        cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
    else:
        cache_size = (len(ref_) + 1) * (len(query_) + 1)

    if self.__cached_score_matrix.shape[0] < cache_size:
        self.__cached_score_matrix.resize((cache_size,))

    if do_affine:
        if self.__cached_deletion_matrix.shape[0] < cache_size:
            self.__cached_deletion_matrix.resize((cache_size,))
        if self.__cached_insertion_matrix.shape[0] < cache_size:
            self.__cached_insertion_matrix.resize((cache_size,))

    if len(query) == 0:
        score, ref_aligned, query_aligned = float('-Inf'), ref_, '-' * len(ref_)
    else:
        score, ref_aligned, query_aligned = _align(
            ref_.encode('utf-8'),
            query_.encode('utf-8'),
            self.__nchars,
            self.__char_map,
            self.__score_matrix_,
            self.__score_matrix_.shape[0],
            open_insertion,
            extend_insertion,
            open_deletion,
            extend_deletion,
            miscall_cost,
            do_local,
            do_affine,
            self.__do_codon,
            self.__codon3x5,
            self.__codon3x4,
            self.__codon3x2,
            self.__codon3x1,
            self.__cached_score_matrix,
            self.__cached_deletion_matrix,
            self.__cached_insertion_matrix
            )
        # only the _align results are decoded; the empty-query branch
        # above already holds text
        if sys.version_info >= (3, 0):
            ref_aligned = ref_aligned.decode('utf-8')
            query_aligned = query_aligned.decode('utf-8')

    # hand the alignment back in the same kind of object that was passed in
    if isinstance(ref, SeqRecord):
        ref_aligned_ = SeqRecord(
            Seq(ref_aligned, ref.seq.alphabet),
            id=ref.id,
            name=ref.name,
            description=ref.description,
            dbxrefs=ref.dbxrefs,
            annotations=ref.annotations
            )
    elif isinstance(ref, Seq):
        # a Seq carries its alphabet directly; it has no ``.seq`` attribute
        ref_aligned_ = Seq(ref_aligned, ref.alphabet)
    else:
        ref_aligned_ = ref_aligned

    if isinstance(query, SeqRecord):
        query_aligned_ = SeqRecord(
            Seq(query_aligned, query.seq.alphabet),
            id=query.id,
            name=query.name,
            description=query.description,
            dbxrefs=query.dbxrefs,
            annotations=query.annotations
            )
    elif isinstance(query, Seq):
        # a Seq carries its alphabet directly; it has no ``.seq`` attribute
        query_aligned_ = Seq(query_aligned, query.alphabet)
    else:
        query_aligned_ = query_aligned

    # normalize score to per-position
    if len(query_):
        score /= (len(query_) / 3) if self.__do_codon else len(query_)

    return score, ref_aligned_, query_aligned_
def __call__(self,
             ref,
             query,
             open_insertion=None,
             extend_insertion=None,
             open_deletion=None,
             extend_deletion=None,
             miscall_cost=None,
             do_local=None,
             do_affine=None):
    """Align ``query`` to ``ref`` and return the per-position score.

    Both inputs are first passed through ``gapless``. Any scoring argument
    left as ``None`` falls back to the value stored on the instance.

    Parameters
    ----------
    ref, query : SeqRecord, Seq or str
        Sequences to align.
    open_insertion, extend_insertion, open_deletion, extend_deletion,
    miscall_cost, do_local, do_affine : optional
        Per-call overrides forwarded to ``_align``.

    Returns
    -------
    tuple
        ``(score, ref_aligned, query_aligned)``. The aligned sequences have
        the same kind (SeqRecord, Seq or plain string) as the respective
        input. For a non-empty query the score is divided by the query
        length (by a third of it in codon mode). An empty query yields a
        score of ``-inf`` and an all-gap query alignment. If the gapless
        inputs are equal and non-empty, they are returned unchanged with
        the summed diagonal score divided by ``len(ref)``.

    Raises
    ------
    ValueError
        In codon mode, when the length of ``ref`` is not a multiple of 3.
    """
    # populate defaults from initialization
    if open_insertion is None:
        open_insertion = self.__open_insertion
    if extend_insertion is None:
        extend_insertion = self.__extend_insertion
    if open_deletion is None:
        open_deletion = self.__open_deletion
    if extend_deletion is None:
        extend_deletion = self.__extend_deletion
    if miscall_cost is None:
        miscall_cost = self.__miscall_cost
    if do_local is None:
        do_local = self.__do_local
    if do_affine is None:
        do_affine = self.__do_affine

    ref = gapless(ref)
    query = gapless(query)

    # if the reference and query are the same, we can return early
    if len(ref) and ref == query:
        if self.__do_codon:
            score = sum(self.__score_matrix[char, char]
                        for char in _translate(ref))
        else:
            score = sum(self.__score_matrix[char, char] for char in ref)
        return score / len(ref), ref, query

    if isinstance(ref, SeqRecord):
        ref_ = str(ref.seq)
    elif isinstance(ref, Seq):
        ref_ = str(ref)
    else:
        ref_ = ref

    if isinstance(query, SeqRecord):
        query_ = str(query.seq)
    elif isinstance(query, Seq):
        query_ = str(query)
    else:
        query_ = query

    # convert to uppercase, because _align assumes it
    ref_ = ref_.upper()
    query_ = query_.upper()

    if self.__do_codon and len(ref_) % 3 != 0:
        raise ValueError(
            'when do_codon = True, len(ref) must be a multiple of 3')

    # NOTE: padding the query with 'N' up to a multiple of 3 in codon mode
    # used to happen here; that step is disabled.

    # for shared memory safety, recreate matrices if the PID changed
    current_pid = getpid()
    if self.__cached_pid != current_pid:
        self.__cached_pid = current_pid
        self.__cached_score_matrix = np.empty((1, ), dtype=float)
        self.__cached_deletion_matrix = np.empty((1, ), dtype=float)
        self.__cached_insertion_matrix = np.empty((1, ), dtype=float)

    # grow the flat work buffers when this problem is larger than any before
    if self.__do_codon:
        cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
    else:
        cache_size = (len(ref_) + 1) * (len(query_) + 1)

    if self.__cached_score_matrix.shape[0] < cache_size:
        self.__cached_score_matrix.resize((cache_size, ))

    if do_affine:
        if self.__cached_deletion_matrix.shape[0] < cache_size:
            self.__cached_deletion_matrix.resize((cache_size, ))
        if self.__cached_insertion_matrix.shape[0] < cache_size:
            self.__cached_insertion_matrix.resize((cache_size, ))

    if len(query) == 0:
        score, ref_aligned, query_aligned = float(
            '-Inf'), ref_, '-' * len(ref_)
    else:
        score, ref_aligned, query_aligned = _align(
            ref_.encode('utf-8'), query_.encode('utf-8'), self.__nchars,
            self.__char_map, self.__score_matrix_,
            self.__score_matrix_.shape[0], open_insertion, extend_insertion,
            open_deletion, extend_deletion, miscall_cost, do_local,
            do_affine, self.__do_codon, self.__codon3x5, self.__codon3x4,
            self.__codon3x2, self.__codon3x1, self.__cached_score_matrix,
            self.__cached_deletion_matrix, self.__cached_insertion_matrix)
        # only the _align results are decoded; the empty-query branch
        # above already holds text
        if sys.version_info >= (3, 0):
            ref_aligned = ref_aligned.decode('utf-8')
            query_aligned = query_aligned.decode('utf-8')

    # hand the alignment back in the same kind of object that was passed in
    if isinstance(ref, SeqRecord):
        ref_aligned_ = SeqRecord(Seq(ref_aligned, ref.seq.alphabet),
                                 id=ref.id,
                                 name=ref.name,
                                 description=ref.description,
                                 dbxrefs=ref.dbxrefs,
                                 annotations=ref.annotations)
    elif isinstance(ref, Seq):
        # a Seq carries its alphabet directly; it has no ``.seq`` attribute
        ref_aligned_ = Seq(ref_aligned, ref.alphabet)
    else:
        ref_aligned_ = ref_aligned

    if isinstance(query, SeqRecord):
        query_aligned_ = SeqRecord(Seq(query_aligned, query.seq.alphabet),
                                   id=query.id,
                                   name=query.name,
                                   description=query.description,
                                   dbxrefs=query.dbxrefs,
                                   annotations=query.annotations)
    elif isinstance(query, Seq):
        # a Seq carries its alphabet directly; it has no ``.seq`` attribute
        query_aligned_ = Seq(query_aligned, query.alphabet)
    else:
        query_aligned_ = query_aligned

    # normalize score to per-position
    if len(query_):
        score /= (len(query_) / 3) if self.__do_codon else len(query_)

    return score, ref_aligned_, query_aligned_