예제 #1
0
파일: __init__.py 프로젝트: nlhepler/BioExt
def _translate_gapped(seq, *args, **kwds):
    """Translate a nucleotide sequence, keeping whole-codon gaps as '-'.

    ``seq`` may be a SeqRecord, a Seq, or a str; any other type raises
    ValueError.  The sequence is right-padded with 'N' up to a multiple of
    three.  Each codon that is exactly '---' contributes one '-' to the
    result, and every stretch between runs of gap codons is handed to
    ``_translate`` with stray '-' characters replaced by 'N'.

    Extra positional and keyword arguments are accepted but not used.
    """
    if isinstance(seq, SeqRecord):
        nucs = str(seq.seq)
    elif isinstance(seq, Seq):
        nucs = str(seq)
    elif isinstance(seq, str):
        nucs = seq
    else:
        raise ValueError(
            "can only translate sequences of type SeqRecord, Seq, or str")

    # pad with 'N' so the sequence splits evenly into codons
    nucs += 'N' * (-len(nucs) % 3)

    protein = ''
    segment_start = 0  # where the pending, not-yet-translated stretch begins
    gap_run = 0        # number of consecutive gap codons seen so far
    for pos in range(0, len(nucs), 3):
        if nucs[pos:pos + 3] == '---':
            if gap_run == 0:
                # first gap codon of a run: flush the stretch before it
                pending = nucs[segment_start:pos]
                protein += _translate(pending.replace('-', 'N'))
            gap_run += 1
        elif gap_run:
            # the gap run just ended: emit it and start a new stretch here
            protein += '-' * gap_run
            gap_run = 0
            segment_start = pos

    # the sequence ends either inside a gap run or with a pending stretch
    tail = ('-' * gap_run if gap_run
            else _translate(nucs[segment_start:].replace('-', 'N')))
    return protein + tail
예제 #2
0
파일: __init__.py 프로젝트: nlhepler/BioExt
def _protein_to_codon(protein_matrix, non_identity_penalty=None):
    """Expand a protein score matrix into a 64x64 codon score matrix.

    Each codon is indexed as ``16 * i + 4 * j + k`` where ``i, j, k`` index
    into ``dletters``.  A codon pair receives the protein score of the amino
    acids the two codons translate to; when ``non_identity_penalty`` is
    truthy it is subtracted for every pair of non-identical codons.  Pairs of
    different amino acids where either one is the stop symbol '*' are never
    written and keep the initial value of -1e4.

    Returns the tuple ``(dletters, codon_matrix)``.
    """
    from BioExt.scorematrices._scorematrix import dletters
    codon_matrix = np.full((64, 64), -1e4, dtype=float)
    pletters = protein_matrix.letters

    # group codon indices by the protein-letter index they translate to
    codons_by_aa = defaultdict(list)
    stop_idxs = set()
    for codon_idx in range(64):
        first, rest = divmod(codon_idx, 16)
        second, third = divmod(rest, 4)
        codon = ''.join(dletters[n] for n in (first, second, third))
        amino = _translate(codon)
        aa_idx = pletters.index(amino)
        if amino == '*':
            stop_idxs.add(aa_idx)
        codons_by_aa[aa_idx].append(codon_idx)

    scores = protein_matrix.tondarray()
    nrows, ncols = scores.shape
    for row in range(nrows):
        for col in range(ncols):
            # penalize transitions to stop codons: leave them at -1e4
            if row != col and (row in stop_idxs or col in stop_idxs):
                continue
            score = scores[row, col]
            for src in codons_by_aa[row]:
                for dst in codons_by_aa[col]:
                    codon_matrix[src, dst] = score
                    if src != dst and non_identity_penalty:
                        codon_matrix[src, dst] -= non_identity_penalty
    return dletters, codon_matrix
예제 #3
0
파일: __init__.py 프로젝트: nlhepler/BioExt
def translate_ambiguous(seq, gap_char=_GAP, trim_gaps=True):
    """Translate a nucleotide sequence, expanding ambiguous nucleotides.

    Args:
        seq: a SeqRecord, Seq, or str of nucleotides.
        gap_char: the character treated as a gap.
        trim_gaps: when true (the default), every ``gap_char`` is removed
            from the sequence before it is split into codons.

    Returns:
        An ``AmbigList`` with one set per codon.  Each set holds every
        translation obtainable by substituting the nucleotides listed in
        ``_NUC_AMBIGS`` for the ambiguous characters of that codon.  A codon
        made of three ``gap_char`` yields the set ``{'-'}``.

    Raises:
        ValueError: if ``seq`` is not a SeqRecord, Seq, or str.
    """
    # str() is used instead of the legacy Seq.tostring() method, and a plain
    # str input is now bound to seqstr (previously it was left unassigned,
    # which raised UnboundLocalError further down).
    if isinstance(seq, SeqRecord):
        seqstr = str(seq.seq)
    elif isinstance(seq, Seq):
        seqstr = str(seq)
    elif isinstance(seq, str):
        seqstr = seq
    else:
        msg = 'can only enumerate codons of a SeqRecord, Seq, or str'
        raise ValueError(msg)

    if trim_gaps:
        seqstr = seqstr.replace(gap_char, '')
    seqstr = seqstr.upper()

    aminos = []
    gap_cdn = 3 * gap_char
    for _, cdn in enumerate_by_codon(seqstr, gap_char):
        # if we're not trimming gaps,
        # convert whole-gap codons into a single gap residue
        if cdn == gap_cdn:
            aminos.append(set('-'))
            continue
        # otherwise enumerate every concrete codon the ambiguity codes allow
        nucs = [
            _NUC_AMBIGS[nuc] if nuc in _NUC_AMBIGS else nuc
            for nuc in cdn
        ]
        aminos.append(set(_translate(''.join(p)) for p in product(*nucs)))

    return AmbigList(aminos)
예제 #4
0
파일: __init__.py 프로젝트: nlhepler/BioExt
 def __init__(self, seq, prior=0):
     """Build cumulative codon-usage distributions from ``seq``.

     ``seq`` may be a SeqRecord, a Seq, or a str; anything else raises
     ValueError.  Complete codons are counted per amino acid (a trailing
     partial codon is ignored, as are codons translating to 'X').  For
     each amino acid the counts, with ``prior`` added to every codon, are
     turned into a list of ``(cumulative probability, codon)`` pairs
     ordered by ascending count; when the total is zero a uniform
     distribution over that amino acid's codons is used instead.

     Raises ValueError if a codon is not among the known codons of the
     amino acid it translates to.
     """
     if isinstance(seq, SeqRecord):
         seq = str(seq.seq)
     elif isinstance(seq, Seq):
         seq = str(seq)
     elif not isinstance(seq, str):
         raise ValueError('seq must be of type SeqRecord, Seq, or str')

     table = _default_table(0)

     # only walk complete codons; a trailing partial codon is dropped
     whole_len = len(seq) - len(seq) % 3
     for start in range(0, whole_len, 3):
         codon = seq[start:start + 3].upper()
         amino = _translate(codon)
         # skip unknown codons, they are irrelevant
         if amino == 'X':
             continue
         if codon not in table[amino]:
             raise ValueError("sequence uses malformed alphabet '%s'" % codon)
         table[amino][codon] += 1

     # replace each amino acid's counts by its cumulative distribution
     for amino, counts in table.items():
         total = prior * len(counts) + sum(counts.values())
         uniform = 1. / len(counts)
         running = 0.
         cdf = []
         for codon, seen in sorted(counts.items(), key=itemgetter(1)):
             running += (seen + prior) / total if total else uniform
             cdf.append((running, codon))
         table[amino] = cdf

     self.__table = table
예제 #5
0
파일: __init__.py 프로젝트: veg/BioExt
def _default_table(prior=0):
    """Return a dict mapping each amino acid to a Counter of its codons.

    All 64 codons over 'ACGT' are translated with ``_translate`` and
    grouped by the resulting amino acid; every codon is entered with the
    count ``prior``.  The unknown amino acid 'X' is always mapped to the
    single codon 'NNN' with a count of 1.
    """
    bases = 'ACGT'
    table = {}
    for cdn in (a + b + c for a in bases for b in bases for c in bases):
        table.setdefault(_translate(cdn), Counter()).update({cdn: prior})
    # the unknown amino acid always falls back to NNN
    table['X'] = Counter({'NNN': 1})
    return table
예제 #6
0
파일: dseqrecord.py 프로젝트: uswa1/pydna
 def __contains__(self, other):
     """Return True if ``other`` occurs in the sequence or its translation.

     The lowercased ``other`` is first searched for in the lowercased
     string form of ``self.seq``.  Failing that, the watson strand (with
     spaces removed) is padded with 'n' and translated in three
     consecutive frames, and ``other`` is searched for in each lowercased
     translation.
     """
     needle = other.lower()
     if needle in str(self.seq).lower():
         return True

     watson = self.seq.watson.replace(" ", "")
     ln = len(watson)
     pad = -ln % 3  # number of 'n' needed to reach a multiple of three
     padded = "n" * pad + watson + "nnn"
     width = pad + ln
     return any(
         needle in _translate(padded[frame:frame + width]).lower()
         for frame in range(3)
     )
예제 #7
0
 def find_aminoacids(self, other):
     """Return the slice of this record that codes for the peptide ``other``.

     The watson strand is translated in three consecutive frames and the
     first frame whose translation contains ``other`` (case-insensitive)
     determines the slice.  ``None`` is returned when no frame matches.

     >>> from pydna.dseqrecord import Dseqrecord
     >>> s=Dseqrecord("atgtacgatcgtatgctggttatattttag")
     >>> s.seq.translate()
     Seq('MYDRMLVIF*')
     >>> "RML" in s
     True
     >>> "MMM" in s
     False
     >>> s.seq.rc().translate()
     Seq('LKYNQHTIVH')
     >>> "QHT" in s.rc()
     True
     >>> "QHT" in s
     False
     >>> slc = s.find_aa("RML")
     >>> slc
     slice(9, 18, None)
     >>> s[slc]
     Dseqrecord(-9)
     >>> code = s[slc].seq
     >>> code
     Dseq(-9)
     cgtatgctg
     gcatacgac
     >>> code.translate()
     Seq('RML')
     """
     peptide = str(other).lower()
     watson = self.seq.watson
     # the watson strand must not contain any whitespace
     assert watson == "".join(watson.split())

     ln = len(watson)
     pad = -ln % 3  # number of 'n' needed to reach a multiple of three
     padded = watson + "n" * pad + "nnn"
     width = ln + pad

     hit = None
     for frame in range(3):
         try:
             hit = _translate(padded[frame:frame + width]).lower().index(peptide)
         except ValueError:
             # not found in this frame; try the next one
             continue
         break

     # only a positive overhang shifts the returned coordinates
     shift = max(self.seq.ovhg, 0)
     if hit is None:
         return None  # TODO return an empty slice or False...?
     first = frame + hit * 3 + shift
     last = frame + (hit + len(peptide)) * 3 + shift
     return slice(first, last)
예제 #8
0
파일: __init__.py 프로젝트: nlhepler/BioExt
    def __call__(
            self,
            ref,
            query,
            open_insertion=None,
            extend_insertion=None,
            open_deletion=None,
            extend_deletion=None,
            miscall_cost=None,
            do_local=None,
            do_affine=None
            ):
        """Align ``query`` to ``ref`` and return a normalized score.

        Both inputs are first passed through ``gapless``.  Every option left
        as ``None`` falls back to the value stored on the instance.

        Args:
            ref: reference sequence (SeqRecord, Seq, or str-like).
            query: query sequence (SeqRecord, Seq, or str-like).
            open_insertion, extend_insertion, open_deletion,
            extend_deletion, miscall_cost, do_local, do_affine: per-call
                overrides that are forwarded to ``_align``.

        Returns:
            A ``(score, ref_aligned, query_aligned)`` tuple.  The aligned
            sequences are re-wrapped as SeqRecord or Seq when the
            gap-stripped input was one.  When the gap-stripped inputs are
            non-empty and equal, the inputs are returned unaligned together
            with their self-score.  When the query is empty the score is
            ``-inf`` and the aligned query consists only of '-'.

        Raises:
            ValueError: in codon mode, if the reference length is not a
                multiple of 3.
        """
        # populate defaults from initialization
        if open_insertion is None:
            open_insertion = self.__open_insertion
        if extend_insertion is None:
            extend_insertion = self.__extend_insertion
        if open_deletion is None:
            open_deletion = self.__open_deletion
        if extend_deletion is None:
            extend_deletion = self.__extend_deletion
        if miscall_cost is None:
            miscall_cost = self.__miscall_cost
        if do_local is None:
            do_local = self.__do_local
        if do_affine is None:
            do_affine = self.__do_affine

        ref = gapless(ref)
        query = gapless(query)

        # if the reference and query are the same, we can return early
        if len(ref) and ref == query:
            if self.__do_codon:
                score = sum(self.__score_matrix[char, char] for char in _translate(ref))
            else:
                score = sum(self.__score_matrix[char, char] for char in ref)
            # NOTE(review): this divides by len(ref) even in codon mode,
            # whereas the normalization at the end of this method divides
            # by len(query_) / 3 -- confirm which one is intended.
            return score / len(ref), ref, query

        # unwrap Biopython containers into plain strings
        if isinstance(ref, SeqRecord):
            ref_ = str(ref.seq)
        elif isinstance(ref, Seq):
            ref_ = str(ref)
        else:
            ref_ = ref

        if isinstance(query, SeqRecord):
            query_ = str(query.seq)
        elif isinstance(query, Seq):
            query_ = str(query)
        else:
            query_ = query

        # convert to uppercase, because _align assumes it
        ref_ = ref_.upper()
        query_ = query_.upper()

        if self.__do_codon and len(ref_) % 3 != 0:
            raise ValueError('when do_codon = True, len(ref) must be a multiple of 3')

        # NOTE: the query is not padded to a multiple of 3 here; only the
        # reference length is validated.

        # for shared memory safety, recreate matrices if the PID changed
        current_pid = getpid()
        if self.__cached_pid != current_pid:
            self.__cached_pid = current_pid
            self.__cached_score_matrix = np.empty((1,), dtype=float)
            self.__cached_deletion_matrix = np.empty((1,), dtype=float)
            self.__cached_insertion_matrix = np.empty((1,), dtype=float)

        # in codon mode the reference is counted in codons, not nucleotides
        if self.__do_codon:
            cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
        else:
            cache_size = (len(ref_) + 1) * (len(query_) + 1)

        # grow (never shrink) the cached work buffers
        if self.__cached_score_matrix.shape[0] < cache_size:
            self.__cached_score_matrix.resize((cache_size,))

        if do_affine:
            if self.__cached_deletion_matrix.shape[0] < cache_size:
                self.__cached_deletion_matrix.resize((cache_size,))

            if self.__cached_insertion_matrix.shape[0] < cache_size:
                self.__cached_insertion_matrix.resize((cache_size,))

        if len(query) == 0:
            # nothing to align: the query is rendered as all gaps
            score, ref_aligned, query_aligned = float('-Inf'), ref_, '-' * len(ref_)
        else:
            score, ref_aligned, query_aligned = _align(
                ref_.encode('utf-8'),
                query_.encode('utf-8'),
                self.__nchars,
                self.__char_map,
                self.__score_matrix_,
                self.__score_matrix_.shape[0],
                open_insertion,
                extend_insertion,
                open_deletion,
                extend_deletion,
                miscall_cost,
                do_local,
                do_affine,
                self.__do_codon,
                self.__codon3x5,
                self.__codon3x4,
                self.__codon3x2,
                self.__codon3x1,
                self.__cached_score_matrix,
                self.__cached_deletion_matrix,
                self.__cached_insertion_matrix
                )

            if sys.version_info >= (3, 0):
                ref_aligned = ref_aligned.decode('utf-8')
                query_aligned = query_aligned.decode('utf-8')

        # re-wrap the aligned strings in the container type of the input
        if isinstance(ref, SeqRecord):
            ref_aligned_ = SeqRecord(
                Seq(ref_aligned, ref.seq.alphabet),
                id=ref.id,
                name=ref.name,
                description=ref.description,
                dbxrefs=ref.dbxrefs,
                annotations=ref.annotations
                )
        elif isinstance(ref, Seq):
            # a Seq carries its alphabet directly; it has no `.seq` attribute
            ref_aligned_ = Seq(ref_aligned, ref.alphabet)
        else:
            ref_aligned_ = ref_aligned

        if isinstance(query, SeqRecord):
            query_aligned_ = SeqRecord(
                Seq(query_aligned, query.seq.alphabet),
                id=query.id,
                name=query.name,
                description=query.description,
                dbxrefs=query.dbxrefs,
                annotations=query.annotations
                )
        elif isinstance(query, Seq):
            # a Seq carries its alphabet directly; it has no `.seq` attribute
            query_aligned_ = Seq(query_aligned, query.alphabet)
        else:
            query_aligned_ = query_aligned

        # normalize score to per-position
        if len(query_):
            score /= (len(query_) / 3) if self.__do_codon else len(query_)

        return score, ref_aligned_, query_aligned_
예제 #9
0
파일: __init__.py 프로젝트: sdwfrost/BioExt
    def __call__(self,
                 ref,
                 query,
                 open_insertion=None,
                 extend_insertion=None,
                 open_deletion=None,
                 extend_deletion=None,
                 miscall_cost=None,
                 do_local=None,
                 do_affine=None):
        """Align ``query`` to ``ref`` and return a normalized score.

        Both inputs are first passed through ``gapless``.  Every option left
        as ``None`` falls back to the value stored on the instance.

        Args:
            ref: reference sequence (SeqRecord, Seq, or str-like).
            query: query sequence (SeqRecord, Seq, or str-like).
            open_insertion, extend_insertion, open_deletion,
            extend_deletion, miscall_cost, do_local, do_affine: per-call
                overrides that are forwarded to ``_align``.

        Returns:
            A ``(score, ref_aligned, query_aligned)`` tuple.  The aligned
            sequences are re-wrapped as SeqRecord or Seq when the
            gap-stripped input was one.  When the gap-stripped inputs are
            non-empty and equal, the inputs are returned unaligned together
            with their self-score.  When the query is empty the score is
            ``-inf`` and the aligned query consists only of '-'.

        Raises:
            ValueError: in codon mode, if the reference length is not a
                multiple of 3.
        """
        # populate defaults from initialization
        if open_insertion is None:
            open_insertion = self.__open_insertion
        if extend_insertion is None:
            extend_insertion = self.__extend_insertion
        if open_deletion is None:
            open_deletion = self.__open_deletion
        if extend_deletion is None:
            extend_deletion = self.__extend_deletion
        if miscall_cost is None:
            miscall_cost = self.__miscall_cost
        if do_local is None:
            do_local = self.__do_local
        if do_affine is None:
            do_affine = self.__do_affine

        ref = gapless(ref)
        query = gapless(query)

        # if the reference and query are the same, we can return early
        if len(ref) and ref == query:
            if self.__do_codon:
                score = sum(self.__score_matrix[char, char]
                            for char in _translate(ref))
            else:
                score = sum(self.__score_matrix[char, char] for char in ref)
            # NOTE(review): this divides by len(ref) even in codon mode,
            # whereas the normalization at the end of this method divides
            # by len(query_) / 3 -- confirm which one is intended.
            return score / len(ref), ref, query

        # unwrap Biopython containers into plain strings
        if isinstance(ref, SeqRecord):
            ref_ = str(ref.seq)
        elif isinstance(ref, Seq):
            ref_ = str(ref)
        else:
            ref_ = ref

        if isinstance(query, SeqRecord):
            query_ = str(query.seq)
        elif isinstance(query, Seq):
            query_ = str(query)
        else:
            query_ = query

        # convert to uppercase, because _align assumes it
        ref_ = ref_.upper()
        query_ = query_.upper()

        if self.__do_codon and len(ref_) % 3 != 0:
            raise ValueError(
                'when do_codon = True, len(ref) must be a multiple of 3')

        # NOTE: the query is not padded to a multiple of 3 here; only the
        # reference length is validated.

        # for shared memory safety, recreate matrices if the PID changed
        current_pid = getpid()
        if self.__cached_pid != current_pid:
            self.__cached_pid = current_pid
            self.__cached_score_matrix = np.empty((1, ), dtype=float)
            self.__cached_deletion_matrix = np.empty((1, ), dtype=float)
            self.__cached_insertion_matrix = np.empty((1, ), dtype=float)

        # in codon mode the reference is counted in codons, not nucleotides
        if self.__do_codon:
            cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
        else:
            cache_size = (len(ref_) + 1) * (len(query_) + 1)

        # grow (never shrink) the cached work buffers
        if self.__cached_score_matrix.shape[0] < cache_size:
            self.__cached_score_matrix.resize((cache_size, ))

        if do_affine:
            if self.__cached_deletion_matrix.shape[0] < cache_size:
                self.__cached_deletion_matrix.resize((cache_size, ))

            if self.__cached_insertion_matrix.shape[0] < cache_size:
                self.__cached_insertion_matrix.resize((cache_size, ))

        if len(query) == 0:
            # nothing to align: the query is rendered as all gaps
            score, ref_aligned, query_aligned = float(
                '-Inf'), ref_, '-' * len(ref_)
        else:
            score, ref_aligned, query_aligned = _align(
                ref_.encode('utf-8'), query_.encode('utf-8'), self.__nchars,
                self.__char_map, self.__score_matrix_,
                self.__score_matrix_.shape[0], open_insertion,
                extend_insertion, open_deletion, extend_deletion, miscall_cost,
                do_local, do_affine, self.__do_codon, self.__codon3x5,
                self.__codon3x4, self.__codon3x2, self.__codon3x1,
                self.__cached_score_matrix, self.__cached_deletion_matrix,
                self.__cached_insertion_matrix)

            if sys.version_info >= (3, 0):
                ref_aligned = ref_aligned.decode('utf-8')
                query_aligned = query_aligned.decode('utf-8')

        # re-wrap the aligned strings in the container type of the input
        if isinstance(ref, SeqRecord):
            ref_aligned_ = SeqRecord(Seq(ref_aligned, ref.seq.alphabet),
                                     id=ref.id,
                                     name=ref.name,
                                     description=ref.description,
                                     dbxrefs=ref.dbxrefs,
                                     annotations=ref.annotations)
        elif isinstance(ref, Seq):
            # a Seq carries its alphabet directly; it has no `.seq` attribute
            ref_aligned_ = Seq(ref_aligned, ref.alphabet)
        else:
            ref_aligned_ = ref_aligned

        if isinstance(query, SeqRecord):
            query_aligned_ = SeqRecord(Seq(query_aligned, query.seq.alphabet),
                                       id=query.id,
                                       name=query.name,
                                       description=query.description,
                                       dbxrefs=query.dbxrefs,
                                       annotations=query.annotations)
        elif isinstance(query, Seq):
            # a Seq carries its alphabet directly; it has no `.seq` attribute
            query_aligned_ = Seq(query_aligned, query.alphabet)
        else:
            query_aligned_ = query_aligned

        # normalize score to per-position
        if len(query_):
            score /= (len(query_) / 3) if self.__do_codon else len(query_)

        return score, ref_aligned_, query_aligned_