def compare(self, other_v, max_extent, max_streak):
        """Measure the distance between this germline and ``other_v``.

        The two aligned sequences are capped at ``max_extent`` and then cut
        at the point inside the CDR3 reported by
        ``dnautils.find_streak_position`` (called with ``max_streak``).

        :returns: ``(distance, length)`` -- the hamming distance over the
            compared region and the length of that region
        :raises AlignmentException: if the CDR3 is empty after alignment or
            the trimmed sequences are empty or differ in length

        """
        aligned = self.align(other_v)
        germ = aligned['base'][:max_extent]
        query = aligned['seq'][:max_extent]
        offset = aligned['cdr3_start']

        # Restrict both CDR3 tails to the length they have in common
        germ_tail = germ[offset:]
        query_tail = query[offset:]
        shared = min(len(germ_tail), len(query_tail))
        germ_tail, query_tail = germ_tail[:shared], query_tail[:shared]
        if not len(germ_tail) or not len(query_tail):
            raise AlignmentException('Empty CDR3 found after alignment')

        # How far does the V reach into the CDR3?
        streak = dnautils.find_streak_position(
            germ_tail, query_tail, max_streak)
        if streak is None:
            # No streak of mismatches: the whole shared CDR3 is compared
            v_extent = min(len(germ_tail), len(query_tail))
        else:
            # Cut relative to the reported streak position
            v_extent = streak - max_streak
        cut = offset + v_extent

        germ, query = germ[:cut], query[:cut]
        if len(germ) == 0 or len(germ) != len(query):
            raise AlignmentException('Unequal sequences after alignment')

        return dnautils.hamming(germ, query), len(query)
    def get_single_tie(self, gene, length, mutation):
        """Return the set of genes treated as tied with ``gene``.

        Results are cached in ``self.ties`` under the key
        ``(int(length), mut_bucket(round(mutation, 3)))``.  A gene is added
        to the tie set when ``self._hypergeom`` gives a probability of at
        least ``TIES_PROB_THRESHOLD`` for the hamming distance between the
        last ``length`` characters of the two germlines.

        :returns: the tie set; just ``{gene}`` when ties are disabled or
            ``gene`` is not a known germline

        """
        if self.no_ties:
            # Ties are disabled (used for genotyping)
            return {gene}

        length = int(length)
        bucket = self.mut_bucket(round(mutation, 3))
        key = (length, bucket)
        if key not in self.ties:
            self.ties[key] = {}
        cache = self.ties[key]

        if gene not in self:
            return {gene}
        if gene in cache:
            return cache[gene]

        def prepare(germline):
            # Optionally drop gap characters before comparing
            return germline.replace('-', '') if self.remove_gaps else germline

        ref_tail = prepare(self[gene])[-length:]
        tied = {gene}
        cache[gene] = tied
        for name, germline in sorted(self.items()):
            mismatches = dnautils.hamming(ref_tail, prepare(germline)[-length:])
            prob = self._hypergeom(length, bucket, mismatches)
            if prob >= self.TIES_PROB_THRESHOLD:
                tied.add(name)
        cache[gene] = self.all_alleles(tied)

        return cache[gene]
Example #3
0
    def compare(self, other_v, max_extent, max_streak):
        """Return the hamming distance to ``other_v`` and the compared length.

        After aligning, both sequences are limited to ``max_extent`` and then
        truncated inside the CDR3 at the position derived from
        ``dnautils.find_streak_position`` and ``max_streak``.

        :returns: tuple ``(distance, compared_length)``
        :raises AlignmentException: on an empty CDR3, or when the truncated
            sequences are empty or have different lengths

        """
        result = self.align(other_v)
        base = result['base'][:max_extent]
        seq = result['seq'][:max_extent]
        start = result['cdr3_start']

        # CDR3 portions of germline and sequence, cut to a common length
        base_cdr3, seq_cdr3 = base[start:], seq[start:]
        overlap = min(len(base_cdr3), len(seq_cdr3))
        base_cdr3 = base_cdr3[:overlap]
        seq_cdr3 = seq_cdr3[:overlap]
        if not (len(base_cdr3) and len(seq_cdr3)):
            raise AlignmentException('Empty CDR3 found after alignment')

        # Extent of the sequence's V into the CDR3
        streak = dnautils.find_streak_position(
            base_cdr3, seq_cdr3, max_streak
        )
        end = start + (
            # No mismatch streak: compare through the whole shared CDR3
            min(len(base_cdr3), len(seq_cdr3)) if streak is None
            # Otherwise cut relative to the reported streak position
            else streak - max_streak
        )

        # Compare up to the end of V
        base = base[:end]
        seq = seq[:end]
        if len(base) != len(seq) or not len(base):
            raise AlignmentException('Unequal sequences after alignment')

        distance = dnautils.hamming(base, seq)
        return distance, len(seq)
Example #4
0
    def get_single_tie(self, gene, length, mutation):
        """Return the cached set of genes tied with ``gene``.

        The cache ``self.ties`` is keyed by
        ``(int(length), mut_bucket(round(mutation, 3)))``.  Every germline
        whose tail (last ``length`` characters) has a hamming distance to
        ``gene``'s tail with ``_hypergeom`` probability of at least
        ``ties_prob_threshold`` is included.

        :returns: the tie set, or ``set([gene])`` if ``gene`` is unknown

        """
        length = int(length)
        mutation = self.mut_bucket(round(mutation, 3))
        by_gene = self.ties.setdefault((length, mutation), {})

        if gene not in self:
            return set([gene])

        if gene not in by_gene:
            strip_gaps = self.remove_gaps
            reference = self[gene]
            if strip_gaps:
                reference = reference.replace('-', '')
            reference = reference[-length:]

            tied = set([gene])
            by_gene[gene] = tied
            for name, other in sorted(self.iteritems()):
                if strip_gaps:
                    other = other.replace('-', '')
                mismatches = dnautils.hamming(reference, other[-length:])
                prob = self._hypergeom(length, mutation, mismatches)
                if prob >= self.ties_prob_threshold:
                    tied.add(name)
            by_gene[gene] = self.all_alleles(tied)

        return by_gene[gene]
Example #5
0
def collapse_similar_cdr3s(session, buckets, difference_allowed):
    """Merge clones whose CDR3 nucleotide sequences are nearly identical.

    For each bucket, the clones with the bucket's ``subject_id`` and
    ``cdr3_num_nts`` are visited in descending ``overall_total_cnt`` order.
    A clone is folded into the first previously kept clone whose
    ``cdr3_nt`` is within ``difference_allowed`` hamming distance; otherwise
    it becomes a new kept clone.  Sequences of folded clones are reassigned
    to the kept clone and the folded clones are deleted.  A single commit is
    issued after all buckets are processed.

    :param session: the database session used for queries and the commit
    :param buckets: a query whose rows expose ``subject_id`` and
        ``cdr3_num_nts``; it must support ``count()`` and iteration
    :param difference_allowed: maximum hamming distance for two CDR3s to be
        collapsed together

    """
    # Count once: each ``count()`` call issues its own query
    num_buckets = buckets.count()
    logger.info('Collapsing similar clones in {} buckets'.format(
        num_buckets))
    for i, bucket in enumerate(buckets):
        clones = session.query(Clone.id, Clone.cdr3_aa, Clone.cdr3_nt).filter(
            Clone.subject_id == bucket.subject_id,
            Clone.cdr3_num_nts == bucket.cdr3_num_nts,
        ).order_by(Clone.overall_total_cnt.desc())
        num_clones = clones.count()
        if num_clones < 2:
            continue
        logger.info('Reducing bucket {} / {} ({} clones)'.format(
            i, num_buckets, num_clones))

        # Maps the CDR3 of each kept clone to [kept_id, folded_id, ...].
        # Insertion order matters: earlier (larger) clones are tried first.
        reduced = {}
        for c in clones:
            for larger_cdr3_nt, others in reduced.items():
                if (dnautils.hamming(larger_cdr3_nt, c.cdr3_nt) <=
                        difference_allowed):
                    others.append(c.id)
                    break
            else:
                reduced[c.cdr3_nt] = [c.id]

        for collapse in reduced.values():
            rep_id, others = collapse[0], collapse[1:]
            if not others:
                # Nothing was folded into this clone; skip the no-op
                # UPDATE/DELETE with an empty IN () list
                continue
            session.query(Sequence).filter(
                Sequence.clone_id.in_(others)).update(
                    {'clone_id': rep_id}, synchronize_session=False)
            session.query(Clone).filter(
                Clone.id.in_(others)).delete(synchronize_session=False)
    session.commit()
Example #6
0
    def get_single_tie(self, gene, length, mutation):
        """Look up (or compute and cache) the genes tied with ``gene``.

        The cache key is ``(int(length), mut_bucket(round(mutation, 3)))``.
        Germlines are compared over their last ``length`` characters, with
        gaps removed first when ``self.remove_gaps`` is set; a germline is
        tied when ``_hypergeom`` yields at least ``TIES_PROB_THRESHOLD``.

        :returns: the tie set; ``{gene}`` alone if ties are disabled or the
            gene is not present in this collection

        """
        # Used to disable gene ties for genotyping
        if self.no_ties:
            return {gene}

        length = int(length)
        mutation = self.mut_bucket(round(mutation, 3))
        key = (length, mutation)
        if key not in self.ties:
            self.ties[key] = {}
        bucket_ties = self.ties[key]

        if gene not in self:
            return {gene}

        if gene not in bucket_ties:
            degap = self.remove_gaps
            target = self[gene].replace('-', '') if degap else self[gene]
            target_tail = target[-length:]

            candidates = {gene}
            bucket_ties[gene] = candidates
            for name, germline in sorted(self.items()):
                if degap:
                    germline = germline.replace('-', '')
                dist = dnautils.hamming(target_tail, germline[-length:])
                if (self._hypergeom(length, mutation, dist) >=
                        self.TIES_PROB_THRESHOLD):
                    candidates.add(name)
            bucket_ties[gene] = self.all_alleles(candidates)

        return bucket_ties[gene]
    def pre_cdr3_match(self):
        """Return the number of matching positions before the CDR3.

        Computed as ``pre_cdr3_length`` minus the hamming distance between
        germline and sequence over ``[seq_start + num_gaps, cdr3_start)``.
        """
        region = slice(self.seq_start + self.num_gaps, self.cdr3_start)
        total = self.pre_cdr3_length
        mismatches = dnautils.hamming(
            self.germline[region], self.sequence[region])
        return total - mismatches
    def v_match(self):
        """Return the number of positions in the V region that match.

        Computed as ``v_length`` minus the hamming distance between the
        filled germline and the sequence over
        ``[seq_start, seq_start + v_length + num_gaps)``.
        """
        first = self.seq_start
        region = slice(first, first + self.v_length + self.num_gaps)
        total = self.v_length
        mismatches = dnautils.hamming(
            self.filled_germline[region], self.sequence[region])
        return total - mismatches
Example #9
0
    def v_match(self):
        """Count V-region positions where sequence and germline agree.

        The compared window starts at ``seq_start`` and spans
        ``v_length + num_gaps`` positions of ``filled_germline`` and
        ``sequence``; the hamming distance over it is subtracted from
        ``v_length``.
        """
        begin = self.seq_start
        stop = begin + self.v_length + self.num_gaps
        v_len = self.v_length
        germ_window = self.filled_germline[begin:stop]
        seq_window = self.sequence[begin:stop]
        return v_len - dnautils.hamming(germ_window, seq_window)
Example #10
0
    def pre_cdr3_match(self):
        """Count pre-CDR3 positions where sequence and germline agree.

        The window runs from ``seq_start + num_gaps`` up to ``cdr3_start``;
        its hamming distance is subtracted from ``pre_cdr3_length``.
        """
        begin = self.seq_start + self.num_gaps
        stop = self.cdr3_start
        length = self.pre_cdr3_length
        germ_window = self.germline[begin:stop]
        seq_window = self.sequence[begin:stop]
        return length - dnautils.hamming(germ_window, seq_window)
Example #11
0
 def get_single_tie(self, gene, length, mutation):
     """Return every gene whose anchor region matches that of ``gene``.

     The last ``anchor_len`` characters of each germline are compared
     (truncated to the length of ``gene``'s anchor); a gene is tied when
     the regions are equal or have a hamming distance of zero.  ``length``
     and ``mutation`` are accepted but not used.
     """
     anchor = self[gene][-self.anchor_len:]
     ties = self.all_alleles(set([gene]))
     for name, candidate in sorted(self.iteritems()):
         tail = candidate[-self.anchor_len:][:len(anchor)]
         # Plain equality is tried first; hamming handles the remainder
         if tail == anchor or dnautils.hamming(tail, anchor) == 0:
             ties.add(name)
     return ties
Example #12
0
    def _find_index(self, sequence, germline):
        """Find where ``germline`` best matches ``sequence`` on either strand.

        A window of ``len(germline)`` is slid over ``sequence`` and then over
        ``sequence.reverse_complement()``; the window with the smallest
        hamming distance wins.  Ties keep the earliest window, with the
        forward strand checked first.

        :param sequence: the read; must support ``len``, slicing and
            ``reverse_complement()``
        :param germline: the germline string to locate
        :returns: ``(position, distance, is_rc)`` where ``position`` is the
            best window start shifted by
            ``len(germline) - self.j_germlines.anchor_len`` and ``is_rc``
            tells whether the match was on the reverse complement
        :raises TypeError: if ``germline`` is longer than ``sequence`` so
            that no window exists (``best_pos`` stays ``None``)

        """
        window = len(germline)
        best_pos, best_hamming, is_rc = None, None, False

        strands = (
            (sequence, False),
            (sequence.reverse_complement(), True),
        )
        for strand, strand_is_rc in strands:
            # ``+ 1`` so the final window, ending exactly at the end of the
            # strand, is also evaluated
            for pos in range(len(strand) - window + 1):
                hamming = dnautils.hamming(strand[pos:pos + window], germline)
                if best_hamming is None or hamming < best_hamming:
                    best_pos = pos
                    best_hamming = hamming
                    is_rc = strand_is_rc

        best_pos += window - self.j_germlines.anchor_len
        return best_pos, best_hamming, is_rc
    def _find_index(self, sequence, germline):
        """Find where ``germline`` best matches ``sequence`` on either strand.

        A window of ``len(germline)`` is slid over ``sequence`` and then over
        ``sequence.reverse_complement()``.  Each window is scored by its
        hamming distance divided by ``len(germline)``; the lowest score wins
        and ties keep the earliest window, forward strand first.

        :param sequence: the read; must support ``len``, slicing and
            ``reverse_complement()``
        :param germline: the germline string to locate
        :returns: ``(position, score, is_rc)`` where ``position`` is the best
            window start shifted by
            ``len(germline) - self.j_germlines.anchor_len`` and ``is_rc``
            tells whether the match was on the reverse complement
        :raises TypeError: if ``germline`` is longer than ``sequence`` so
            that no window exists (``best_pos`` stays ``None``)

        """
        window = len(germline)
        best_pos, best_hamming, is_rc = None, None, False

        strands = (
            (sequence, False),
            (sequence.reverse_complement(), True),
        )
        for strand, strand_is_rc in strands:
            # Both strands use the same inclusive bound, so the final window
            # is evaluated on the forward strand as well
            for pos in range(len(strand) - window + 1):
                hamming = dnautils.hamming(
                    strand[pos:pos + window], germline) / len(germline)
                if best_hamming is None or hamming < best_hamming:
                    best_pos = pos
                    best_hamming = hamming
                    is_rc = strand_is_rc

        best_pos += window - self.j_germlines.anchor_len
        return best_pos, best_hamming, is_rc
 def get_single_tie(self, gene, length, mutation):
     """Return every gene whose anchor region matches that of ``gene``.

     When ``self.no_ties`` is set only ``gene`` itself is returned.
     Otherwise the last ``anchor_len`` characters of each germline
     (truncated to the length of ``gene``'s anchor) are compared, and a
     gene is tied when the regions are equal or have a hamming distance of
     zero.  ``length`` and ``mutation`` are accepted but not used.
     """
     if self.no_ties:
         # Ties are disabled (used for genotyping)
         return {gene}

     anchor = self[gene][-self.anchor_len:]
     ties = self.all_alleles({gene})
     for name, candidate in sorted(self.items()):
         tail = candidate[-self.anchor_len:][:len(anchor)]
         if tail == anchor or dnautils.hamming(tail, anchor) == 0:
             ties.add(name)
     return ties
Example #15
0
 def get_single_tie(self, gene, length, mutation):
     """Collect the genes that share ``gene``'s anchor region.

     Returns just ``gene`` when ``self.no_ties`` is set.  Otherwise each
     germline's last ``anchor_len`` characters, cut to the length of
     ``gene``'s anchor, are matched against ``gene``'s anchor by equality
     or a zero hamming distance.  ``length`` and ``mutation`` are unused.
     """
     # Used to disable gene ties for genotyping
     if self.no_ties:
         return {gene}

     reference = self[gene][-self.anchor_len:]
     ref_len = len(reference)
     result = self.all_alleles({gene})
     for key, value in sorted(self.items()):
         region = value[-self.anchor_len:][:ref_len]
         same = region == reference
         if same or dnautils.hamming(region, reference) == 0:
             result.add(key)
     return result
    def has_possible_indel(self):
        """Report whether any pre-CDR3 window is mismatched enough to flag.

        Comparison starts at the first ``A``/``T``/``C``/``G`` character of
        ``self.sequence.sequence`` and stops at ``cdr3_start``.  Every
        window of ``INDEL_WINDOW`` positions is checked; the method returns
        ``True`` as soon as one has a hamming distance of at least
        ``INDEL_MISMATCH_THRESHOLD * INDEL_WINDOW``.
        """
        first_base = re.search('[ATCG]', self.sequence.sequence).start()
        germ = self.germline[first_base:self.cdr3_start]
        seq = self.sequence[first_base:self.cdr3_start]

        window = self.INDEL_WINDOW
        cutoff = self.INDEL_MISMATCH_THRESHOLD * window
        return any(
            dnautils.hamming(germ[i:i + window], seq[i:i + window]) >= cutoff
            for i in range(len(germ) - window + 1)
        )
Example #17
0
    def has_possible_indel(self):
        """Report whether any pre-CDR3 window is mismatched enough to flag.

        Comparison starts at the first ``A``/``T``/``C``/``G`` character of
        ``self.sequence`` and stops at ``cdr3_start``.  Each window of
        ``INDEL_WINDOW`` positions is checked in order, and ``True`` is
        returned for the first one whose hamming distance reaches
        ``INDEL_MISMATCH_THRESHOLD * INDEL_WINDOW``.
        """
        offset = re.search('[ATCG]', self.sequence).start()
        region = slice(offset, self.cdr3_start)
        germ_region = self.germline[region]
        seq_region = self.sequence[region]

        size = self.INDEL_WINDOW
        last_start = len(germ_region) - size
        pos = 0
        while pos <= last_start:
            mismatches = dnautils.hamming(germ_region[pos:pos + size],
                                          seq_region[pos:pos + size])
            if mismatches >= self.INDEL_MISMATCH_THRESHOLD * size:
                return True
            pos += 1

        return False
Example #18
0
def similar_to_all(seq, rest, min_similarity):
    """Check that ``seq`` is similar enough to every entry of ``rest``.

    Similarity is ``1 - hamming / len(other.cdr3_aa)``, where the hamming
    distance is taken between the two ``cdr3_aa`` strings after replacing
    ``X`` with ``-``.

    :param seq: Object with a ``cdr3_aa`` string to compare
    :param rest: Iterable of objects with a ``cdr3_aa`` string
    :param float min_similarity: Minimum fraction to be considered similar

    :returns: If ``seq`` is similar to every entry in ``rest`` (``True``
        when ``rest`` is empty)
    :rtype: bool

    """
    def similarity(other):
        mismatches = dnautils.hamming(other.cdr3_aa.replace('X', '-'),
                                      seq.cdr3_aa.replace('X', '-'))
        return 1 - mismatches / float(len(other.cdr3_aa))

    return not any(similarity(other) < min_similarity for other in rest)
Example #19
0
def similar_to_all(seq, rest, field, min_similarity):
    """Determines if ``seq`` is at least ``min_similarity`` similar to every
    entry of ``rest``, comparing the ``cdr3_<field>`` attribute.

    Similarity is ``1 - hamming / len(other_cdr3)``.  Both the hamming
    distance and the length are taken from the same ``cdr3_<field>`` string
    (with ``X`` replaced by ``-`` for the distance), so the fraction is
    correctly scaled for nucleotide as well as amino-acid comparisons.

    :param seq: Object whose ``cdr3_<field>`` string is compared
    :param rest: Iterable of objects with the same attribute
    :param str field: Suffix selecting the attribute, e.g. ``aa`` for
        ``cdr3_aa``
    :param float min_similarity: Minimum fraction to be considered similar

    :returns: If ``seq`` is similar to every entry in ``rest`` (``True``
        when ``rest`` is empty)
    :rtype: bool

    """
    attr = 'cdr3_' + field
    for comp_seq in rest:
        comp_cdr3 = getattr(comp_seq, attr)
        dist = dnautils.hamming(
            comp_cdr3.replace('X', '-'),
            getattr(seq, attr).replace('X', '-')
        )
        # Normalise by the length of the string that was actually compared
        sim_frac = 1 - dist / len(comp_cdr3)
        if sim_frac < min_similarity:
            return False
    return True
Example #20
0
    def _found_j(self, i, j_gene, match):
        """Record a J anchor hit and work out the J gene(s) and J extent.

        Sets ``j_anchor_pos``, ``j_anchor_len``, ``_j`` (the set of
        germline J genes with the smallest hamming distance to the end of
        the sequence), ``_j_start`` and ``j_length``.

        :param i: position of the anchor match in ``self.sequence``
        :param j_gene: unused; the best gene(s) are recomputed here
        :param match: the matched anchor; only its length is used
        :raises AlignmentException: if no J germline is available to
            compare, if the germline or sequence portion inside the CDR3 is
            empty, or if the trimmed germline J does not fit inside the
            sequence

        """
        # If a match is found, record its location and gene
        self.j_anchor_pos = i
        self.j_anchor_len = len(match)
        end_of_j = min(self.j_anchor_pos + self.j_germlines.anchor_len,
                       len(self.sequence))
        best_dist = None
        self._j = []
        if self._force_js:
            # Only consider the J genes the caller forced
            j_germs = {
                k: v
                for k, v in self.j_germlines.iteritems() if k in self._force_js
            }
        else:
            j_germs = self.j_germlines
        for candidate, j_seq in j_germs.iteritems():
            seq_j = self.sequence[end_of_j - len(j_seq):end_of_j]
            dist = dnautils.hamming(seq_j, j_seq[:len(seq_j)])
            if best_dist is None or dist < best_dist:
                best_dist = dist
                self._j = set([candidate])
            elif dist == best_dist:
                self._j.add(candidate)

        # ``_j`` is still the empty list when there was nothing to compare
        # against (for example, no forced J exists in the germlines)
        if not self._j:
            self._j = None
            raise AlignmentException('Could not find suitable J anchor')

        # Get the full germline J gene
        j_full = self.j_germlines[self.j_gene[0]]

        # Get the portion of the germline J in the CDR3
        germline_in_cdr3 = self.j_germlines.get_j_in_cdr3(self.j_gene[0])
        cdr3_end_pos = (self.j_anchor_pos + self.j_germlines.anchor_len -
                        self.j_germlines.upstream_of_cdr3)
        sequence_in_cdr3 = self.sequence[cdr3_end_pos -
                                         len(germline_in_cdr3):cdr3_end_pos]
        if len(germline_in_cdr3) == 0 or len(sequence_in_cdr3) == 0:
            self._j = None
            raise AlignmentException('Could not find sequence or germline in '
                                     'CDR3')

        # Get the extent of the J in the CDR3, scanning from the J end
        # backwards into the CDR3
        streak = find_streak_position(reversed(germline_in_cdr3),
                                      reversed(sequence_in_cdr3),
                                      self.MISMATCH_THRESHOLD)

        # Trim the J gene based on the extent in the CDR3
        if streak is not None:
            j_full = j_full[len(germline_in_cdr3) - streak:]

        # Find where the full J starts
        self._j_start = self.j_anchor_pos + len(match) - len(j_full)

        # If the trimmed germline J extends past the end of the
        # sequence, there is a misalignment
        if len(j_full) != len(
                self.sequence[self._j_start:self._j_start + len(j_full)]):
            self._j = None
            self.j_anchor_pos = None
            raise AlignmentException('Germline extended past end of J')

        self.j_length = len(j_full)
Example #21
0
    def align_to_germline(self, avg_len=None, avg_mut=None, trim_to=None):
        """Build the gapped germline, align the sequence to it and compute
        the match statistics.

        Mutates ``sequence``, ``quality``, ``germline``, ``j_anchor_pos``
        and, when ``trim_to`` is given, ``_pad_len`` and ``v_length``.  Sets
        ``pre_cdr3_length``, ``pre_cdr3_match``, ``post_cdr3_length``,
        ``post_cdr3_match``, ``v_match`` and ``j_match``.

        :param avg_len: with ``avg_mut``, passed to ``get_ties`` to recompute
            the V and J gene ties; ignored unless both are given
        :param avg_mut: see ``avg_len``
        :param trim_to: if given, the first ``trim_to`` positions of the
            aligned sequence are replaced with ``N`` (gaps are kept)
        :raises AlignmentException: if the computed CDR3 length is below 3

        """
        if avg_len is not None and avg_mut is not None:
            self._v = self.v_germlines.get_ties(self.v_gene, avg_len, avg_mut)
            self._j = self.j_germlines.get_ties(self.j_gene, avg_len, avg_mut)
        # Set the germline to the V gene up to the CDR3
        self.germline = get_common_seq([self.v_germlines[v]
                                        for v in self._v])[:CDR3_OFFSET]
        # If we need to pad the sequence, do so, otherwise trim the sequence to
        # the germline length
        if self._pad_len >= 0:
            self.sequence = 'N' * self._pad_len + str(self.sequence)
            if self.quality is not None:
                self.quality = (' ' * self._pad_len) + self.quality
        else:
            # _pad_len is negative here: keep the first -_pad_len characters
            # as the removed prefix and drop them from sequence and quality
            self.removed_prefix = self.sequence[:-self._pad_len]
            self.sequence = str(self.sequence[-self._pad_len:])
            if self.quality is not None:
                self.removed_prefix_qual = self.quality[:-self._pad_len]
                self.quality = self.quality[-self._pad_len:]
        # Update the anchor positions after adding padding / trimming
        self.j_anchor_pos += self._pad_len

        # Add germline gaps to sequence before CDR3 and update anchor positions
        for i, c in enumerate(self.germline):
            if c == '-':
                self.sequence = self.sequence[:i] + '-' + self.sequence[i:]
                if self.quality is not None:
                    self.quality = self.quality[:i] + ' ' + self.quality[i:]
                self.j_anchor_pos += 1

        # The J germlines are reversed before get_common_seq and the result is
        # reversed back, so they are compared from their ends.
        # NOTE(review): presumably get_common_seq accepts the iterator that
        # ``map`` returns on Python 3 -- confirm.
        j_germ = get_common_seq(
            map(reversed, [self.j_germlines[j] for j in self.j_gene]))
        j_germ = ''.join(reversed(j_germ))
        # Calculate the length of the CDR3
        self._cdr3_len = (self.j_anchor_pos + self.j_germlines.anchor_len -
                          self.j_germlines.upstream_of_cdr3 - self.cdr3_start)

        if self._cdr3_len < 3:
            # NOTE(review): the message has no ``{}`` placeholder, so the
            # ``format`` argument is not included in the text
            raise AlignmentException('CDR3 has no AAs'.format(self._cdr3_len))

        self.j_anchor_pos += self._cdr3_len
        # Fill germline CDR3 with gaps
        self.germline += '-' * self._cdr3_len
        self.germline += j_germ[-self.j_germlines.upstream_of_cdr3:]
        # If the sequence is longer than the germline, trim it
        if len(self.sequence) > len(self.germline):
            self.sequence = self.sequence[:len(self.germline)]
            if self.quality is not None:
                self.quality = self.quality[:len(self.germline)]
        elif len(self.sequence) < len(self.germline):
            # Otherwise pad the sequence (and quality) up to the germline
            # length
            self.sequence += 'N' * (len(self.germline) - len(self.sequence))
            if self.quality is not None:
                self.quality += ' ' * (len(self.germline) - len(self.quality))

        if trim_to is not None:
            # Mask the first ``trim_to`` positions with N, keeping gaps, then
            # recompute the padding and shrink v_length by the padding added
            old_padding = max(self._pad_len, 0)
            new_prefix = ''.join(
                [c if c == '-' else 'N' for c in self.sequence[:trim_to]])
            self.sequence = new_prefix + self.sequence[trim_to:]
            # NOTE(review): the pattern is not a raw string, so ``\-`` is an
            # unrecognised string escape; newer Python versions warn about it
            v_start = re.match('[N\-]*', self.sequence).span()[1]
            self._pad_len = self.sequence[:v_start].count('N')
            self.v_length -= self._pad_len - old_padding

        # Get the pre-CDR3 germline
        pre_cdr3_germ = self.germline[:self.cdr3_start]
        pre_cdr3_seq = self.sequence[:self.cdr3_start]

        # If there is padding, get rid of it in the sequence and align the
        # germline
        if self._pad_len > 0:
            pre_cdr3_germ = pre_cdr3_germ[self._pad_len:]
            pre_cdr3_seq = pre_cdr3_seq[self._pad_len:]

        # Calculate the pre-CDR3 length and distance
        self.pre_cdr3_length = len(pre_cdr3_seq)
        self.pre_cdr3_match = self.pre_cdr3_length - dnautils.hamming(
            str(pre_cdr3_seq), str(pre_cdr3_germ))

        # Get the length of J after the CDR3
        self.post_cdr3_length = self.j_germlines.upstream_of_cdr3
        # Get the sequence and germline sequences after CDR3
        post_j = j_germ[-self.post_cdr3_length:]
        post_s = self.sequence[-self.post_cdr3_length:]

        # Calculate their match count
        self.post_cdr3_match = self.post_cdr3_length - dnautils.hamming(
            post_j, post_s)

        # V matches are counted over everything before the CDR3
        self.v_match = self.v_length - dnautils.hamming(
            self.germline[:self.cdr3_start], self.sequence[:self.cdr3_start])

        # J matches are counted over the last len(j_germ) positions
        self.j_match = self.j_length - dnautils.hamming(
            self.germline[-len(j_germ):], self.sequence[-len(j_germ):])
 def j_match(self):
     """Return the number of positions in the J region that match.

     Computed as ``j_length`` minus the hamming distance between the last
     ``j_length`` characters of ``filled_germline`` and ``sequence``.
     """
     total = self.j_length
     tail = slice(-self.j_length, None)
     mismatches = dnautils.hamming(
         self.filled_germline[tail], self.sequence[tail])
     return total - mismatches
Example #23
0
    def process_j(self, alignment, i, match_len, limit_js):
        """Record a J anchor hit on ``alignment`` and determine its J genes.

        Sets ``alignment.j_anchor_pos``, ``alignment.j_gene`` (the set of
        germline J genes with the smallest hamming distance to the end of
        the sequence), ``alignment.germline_cdr3``, ``alignment.j_length``
        and ``alignment.post_cdr3_length``.

        :param alignment: the alignment object being filled in
        :param i: position of the anchor match in ``alignment.sequence``
        :param match_len: length of the matched anchor
        :param limit_js: if truthy, only germlines whose key ``name`` is in
            this collection are considered
        :raises AlignmentException: if ``alignment.j_gene`` is empty after
            the search, if the germline or sequence portion inside the CDR3
            is empty, or if the trimmed germline J does not fit inside the
            sequence

        """
        # If a match is found, record its location and gene
        alignment.j_anchor_pos = i
        end_of_j = min(alignment.j_anchor_pos + self.j_germlines.anchor_len,
                       len(alignment.sequence))
        best_dist = None
        if limit_js:
            j_germs = {
                k: v
                for k, v in self.j_germlines.items() if k.name in limit_js
            }
        else:
            j_germs = self.j_germlines

        # Keep every J gene tied for the smallest hamming distance.
        # NOTE(review): ``end_of_j - len(j_seq)`` goes negative when the
        # germline is longer than the available sequence, which makes the
        # slice count from the end -- confirm this is intended.
        for j_gene, j_seq in j_germs.items():
            seq_j = alignment.sequence[end_of_j - len(j_seq):end_of_j]
            dist = dnautils.hamming(seq_j, j_seq[:len(seq_j)])
            if best_dist is None or dist < best_dist:
                best_dist = dist
                alignment.j_gene = set([j_gene])
            elif dist == best_dist:
                alignment.j_gene.add(j_gene)

        # ``alignment.j_gene`` is only assigned inside the loop above, so
        # this relies on it already existing when ``j_germs`` is empty
        # (assumes the caller initialises it -- TODO confirm)
        if len(alignment.j_gene) == 0:
            raise AlignmentException('Could not find suitable J anchor')

        # Get the full germline J gene
        # (the first gene in sorted order stands in for all tied genes)
        ex_j = sorted(alignment.j_gene)[0]
        j_full = self.j_germlines[ex_j]

        # Get the portion of the germline J in the CDR3
        germline_in_cdr3 = self.j_germlines.get_j_in_cdr3(ex_j)
        cdr3_end_pos = (alignment.j_anchor_pos + self.j_germlines.anchor_len -
                        self.j_germlines.upstream_of_cdr3)
        sequence_in_cdr3 = alignment.sequence[cdr3_end_pos -
                                              len(germline_in_cdr3
                                                  ):cdr3_end_pos]
        if len(germline_in_cdr3) == 0 or len(sequence_in_cdr3) == 0:
            alignment.j_gene = set()
            raise AlignmentException('Could not find sequence or germline in '
                                     'CDR3')

        # Get the extent of the J in the CDR3
        # (both strings are reversed so the scan runs from the J end back
        # into the CDR3)
        streak = dnautils.find_streak_position(germline_in_cdr3[::-1],
                                               sequence_in_cdr3[::-1],
                                               self.MISMATCH_THRESHOLD)

        # Trim the J gene based on the extent in the CDR3
        if streak is not None:
            j_full = j_full[len(germline_in_cdr3) - streak:]
            alignment.germline_cdr3 = germline_in_cdr3[-streak:]
        else:
            alignment.germline_cdr3 = germline_in_cdr3

        # Find where the full J starts
        j_start = alignment.j_anchor_pos + match_len - len(j_full)

        # If the trimmed germline J extends past the end of the
        # sequence, there is a misalignment
        if len(j_full) != len(
                alignment.sequence[j_start:j_start + len(j_full)]):
            alignment.j_gene = set()
            raise AlignmentException('Germline extended past end of J')

        alignment.j_length = len(j_full)
        alignment.post_cdr3_length = self.j_germlines.upstream_of_cdr3
    def process_j(self, alignment, i, match_len, limit_js):
        """Record a J anchor hit on ``alignment`` and determine its J genes.

        Sets ``alignment.j_anchor_pos``, ``alignment.j_gene`` (the set of
        germline J genes with the smallest length-normalised hamming
        distance to the end of the sequence), ``alignment.germline_cdr3``,
        ``alignment.j_length`` and ``alignment.post_cdr3_length``.

        :param alignment: the alignment object being filled in
        :param i: position of the anchor match in ``alignment.sequence``
        :param match_len: length of the matched anchor
        :param limit_js: if truthy, only germlines whose key ``name`` is in
            this collection are considered
        :raises AlignmentException: if ``alignment.j_gene`` is empty after
            the search, if the germline or sequence portion inside the CDR3
            is empty, or if the trimmed germline J does not fit inside the
            sequence

        """
        # If a match is found, record its location and gene
        alignment.j_anchor_pos = i
        end_of_j = min(
            alignment.j_anchor_pos + self.j_germlines.anchor_len,
            len(alignment.sequence)
        )
        if limit_js:
            j_germs = {
                k: v for k, v in self.j_germlines.items()
                if k.name in limit_js
            }
        else:
            j_germs = self.j_germlines

        # Keep every J gene tied for the smallest distance.  The distance is
        # divided by the compared length, and empty slices are skipped to
        # avoid dividing by zero.
        best_dist = None
        for j_gene, j_seq in j_germs.items():
            seq_j = alignment.sequence[end_of_j - len(j_seq):end_of_j]
            if seq_j:
                dist = dnautils.hamming(seq_j, j_seq[:len(seq_j)]) / len(seq_j)
                if best_dist is None or dist < best_dist:
                    best_dist = dist
                    alignment.j_gene = set([j_gene])
                elif dist == best_dist:
                    alignment.j_gene.add(j_gene)

        # ``alignment.j_gene`` is only assigned inside the loop above, so
        # this relies on it already existing when no gene was compared
        # (assumes the caller initialises it -- TODO confirm)
        if len(alignment.j_gene) == 0:
            raise AlignmentException('Could not find suitable J anchor')

        # Get the full germline J gene
        # (the first gene in sorted order stands in for all tied genes)
        ex_j = sorted(alignment.j_gene)[0]
        j_full = self.j_germlines[ex_j]

        # Get the portion of the germline J in the CDR3
        germline_in_cdr3 = self.j_germlines.get_j_in_cdr3(ex_j)
        cdr3_end_pos = (
            alignment.j_anchor_pos + self.j_germlines.anchor_len -
            self.j_germlines.upstream_of_cdr3
        )
        sequence_in_cdr3 = alignment.sequence[
            cdr3_end_pos - len(germline_in_cdr3):
            cdr3_end_pos
        ]
        if len(germline_in_cdr3) == 0 or len(sequence_in_cdr3) == 0:
            alignment.j_gene = set()
            raise AlignmentException('Could not find sequence or germline in '
                                     'CDR3')

        # Get the extent of the J in the CDR3
        # (both strings are reversed so the scan runs from the J end back
        # into the CDR3)
        streak = dnautils.find_streak_position(
            germline_in_cdr3[::-1],
            sequence_in_cdr3[::-1],
            self.MISMATCH_THRESHOLD)

        # Trim the J gene based on the extent in the CDR3
        if streak is not None:
            j_full = j_full[len(germline_in_cdr3) - streak:]
            alignment.germline_cdr3 = germline_in_cdr3[-streak:]
        else:
            alignment.germline_cdr3 = germline_in_cdr3

        # Find where the full J starts
        j_start = alignment.j_anchor_pos + match_len - len(j_full)

        # If the trimmed germline J extends past the end of the
        # sequence, there is a misalignment
        if len(j_full) != len(alignment.sequence[j_start:j_start+len(j_full)]):
            alignment.j_gene = set()
            raise AlignmentException('Germline extended past end of J')

        alignment.j_length = len(j_full)
        alignment.post_cdr3_length = self.j_germlines.upstream_of_cdr3
Example #25
0
 def post_cdr3_match(self):
     """Return the number of matching positions after the CDR3.

     Computed as ``post_cdr3_length`` minus the hamming distance between
     the last ``post_cdr3_length`` characters of germline and sequence.
     """
     total = self.post_cdr3_length
     tail = slice(-self.post_cdr3_length, None)
     mismatches = dnautils.hamming(self.germline[tail], self.sequence[tail])
     return total - mismatches
Example #26
0
 def get_distances(self, seqs):
     """Return the pairwise normalised hamming distance matrix of ``seqs``.

     Entry ``[i, j]`` (and its mirror ``[j, i]``) holds the hamming
     distance between the two sequences divided by the length of the one
     with the larger index.  Each unordered pair is computed once.

     :param seqs: sized, re-iterable collection of sequences
     :returns: a ``len(seqs) x len(seqs)`` NumPy array
     """
     dists = np.zeros((len(seqs), len(seqs)))
     for i, s1 in enumerate(seqs):
         # Only visit j <= i; the mirrored cell is filled at the same time
         for j, s2 in zip(range(i + 1), seqs):
             dists[i, j] = dists[j, i] = dnautils.hamming(s1, s2) / len(s1)
     return dists
Example #27
0
 def j_match(self):
     """Count J-region positions where sequence and germline agree.

     The last ``j_length`` characters of ``filled_germline`` and
     ``sequence`` are compared; their hamming distance is subtracted from
     ``j_length``.
     """
     j_len = self.j_length
     germ_tail = self.filled_germline[-self.j_length:]
     seq_tail = self.sequence[-self.j_length:]
     return j_len - dnautils.hamming(germ_tail, seq_tail)
 def post_cdr3_match(self):
     """Count post-CDR3 positions where sequence and germline agree.

     The last ``post_cdr3_length`` characters of ``germline`` and
     ``sequence`` are compared; their hamming distance is subtracted from
     ``post_cdr3_length``.
     """
     length = self.post_cdr3_length
     germ_tail = self.germline[-self.post_cdr3_length:]
     seq_tail = self.sequence[-self.post_cdr3_length:]
     return length - dnautils.hamming(germ_tail, seq_tail)