Example #1
0
    def set_final_cyst_tryp_positions(self,
                                      debug=False,
                                      total_length_from_right=-1):
        """ Set tryp position in the final, combined sequence. """
        self.final_cyst_position = self.local_cyst_position - self.effective_erosions[
            'v_5p']
        self.final_tryp_position = utils.find_tryp_in_joined_seq(
            self.local_tryp_position, self.eroded_seqs['v'],
            self.insertions['vd'], self.eroded_seqs['d'],
            self.insertions['dj'], self.eroded_seqs['j'],
            self.erosions['j_5p'])
        if debug:
            print '  final tryptophan position: %d' % self.final_tryp_position
        # make sure cdr3 length matches the desired length in vdj_combo_label
        final_cdr3_length = self.final_tryp_position - self.final_cyst_position + 3
        if debug:
            print '  final_tryp_position - final_cyst_position + 3 = %d - %d + 3 = %d (should be %d)' % (
                self.final_tryp_position, self.final_cyst_position,
                final_cdr3_length, self.cdr3_length)
        utils.check_both_conserved_codons(
            self.eroded_seqs['v'] + self.insertions['vd'] +
            self.eroded_seqs['d'] + self.insertions['dj'] +
            self.eroded_seqs['j'], self.final_cyst_position,
            self.final_tryp_position)

        assert final_cdr3_length == int(self.cdr3_length)

        assert total_length_from_right == -1  # deprecated (I think) now that I'm adding the mimic_data_read_length option
Example #2
0
 def set_final_cyst_tryp_positions(self, debug=False):
     """ Set tryp position in the final, combined sequence. """
     self.final_cyst_position = self.local_cyst_position - self.effective_erosions['v_5p']
     self.final_tryp_position = utils.find_tryp_in_joined_seq(self.local_tryp_position,
                                                             self.eroded_seqs['v'],
                                                             self.insertions['vd'],
                                                             self.eroded_seqs['d'],
                                                             self.insertions['dj'],
                                                             self.eroded_seqs['j'],
                                                             self.erosions['j_5p'])
     if debug:
         print '  final tryptophan position: %d' % self.final_tryp_position
     # make sure cdr3 length matches the desired length in vdj_combo_label
     final_cdr3_length = self.final_tryp_position - self.final_cyst_position + 3
     if debug:
         print '  final_tryp_position - final_cyst_position + 3 = %d - %d + 3 = %d (should be %d)' % (self.final_tryp_position, self.final_cyst_position, final_cdr3_length, self.cdr3_length)
     utils.check_both_conserved_codons(self.eroded_seqs['v'] + self.insertions['vd'] + self.eroded_seqs['d'] + self.insertions['dj'] + self.eroded_seqs['j'], self.final_cyst_position, self.final_tryp_position)
         
     assert final_cdr3_length == int(self.cdr3_length)
Example #3
0
    def set_final_cyst_tryp_positions(self, debug=False, total_length_from_right=-1):
        """ Set tryp position in the final, combined sequence. """
        self.final_cyst_position = self.local_cyst_position - self.effective_erosions["v_5p"]
        self.final_tryp_position = utils.find_tryp_in_joined_seq(
            self.local_tryp_position,
            self.eroded_seqs["v"],
            self.insertions["vd"],
            self.eroded_seqs["d"],
            self.insertions["dj"],
            self.eroded_seqs["j"],
            self.erosions["j_5p"],
        )
        if debug:
            print "  final tryptophan position: %d" % self.final_tryp_position
        # make sure cdr3 length matches the desired length in vdj_combo_label
        final_cdr3_length = self.final_tryp_position - self.final_cyst_position + 3
        if debug:
            print "  final_tryp_position - final_cyst_position + 3 = %d - %d + 3 = %d (should be %d)" % (
                self.final_tryp_position,
                self.final_cyst_position,
                final_cdr3_length,
                self.cdr3_length,
            )
        utils.check_both_conserved_codons(
            self.eroded_seqs["v"]
            + self.insertions["vd"]
            + self.eroded_seqs["d"]
            + self.insertions["dj"]
            + self.eroded_seqs["j"],
            self.final_cyst_position,
            self.final_tryp_position,
        )

        assert final_cdr3_length == int(self.cdr3_length)

        assert (
            total_length_from_right == -1
        )  # deprecated (I think) now that I'm adding the mimic_data_read_length option
Example #4
0
    def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds):
        if self.debug:
            print '%s' % query_name

        best, match_names, n_matches = {}, {}, {}
        n_used = {'v':0, 'd':0, 'j':0}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region], reverse=True)
            match_names[region] = []
        codon_positions = {'v':-1, 'd':-1, 'j':-1}  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)
        for region in utils.regions:
            n_matches[region] = len(all_match_names[region])
            n_skipped = 0
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(query_seq)  # NOTE I'm putting these up avove as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.germline_seqs[region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]

                # TODO since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative

                # only use a specified set of genes
                if self.args.only_genes is not None and gene not in self.args.only_genes:
                    n_skipped += 1
                    continue

                # add match to the list
                n_used[region] += 1
                match_names[region].append(gene)

                self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %d not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                if region == 'v':
                    this_k_v = all_query_bounds[gene][1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
                                                          # In other words, sw doesn't tell the hmm about it
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][1] - first_match_query_bounds[1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]
                    best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

            if self.debug and n_skipped > 0:
                print '%8s skipped %d %s genes' % ('', n_skipped, region)

        for region in utils.regions:
            if region not in best:
                print '      no', region, 'match found for', query_name  # NOTE if no d match found, we should really just assume entire d was eroded
                return

        # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
        try:
            self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best)
        except AssertionError:
            print '%s: apportionment failed' % query_name
            return

        # check for unproductive rearrangements
        for region in utils.regions:
            codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds[best[region]], all_query_bounds[best[region]], assert_on_fail=False)  # position in the query sequence, that is
        codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.debug, extra_str='      ', assert_on_fail=False)
        cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
        in_frame_cdr3 = (cdr3_length % 3 == 0)
        if self.debug and not in_frame_cdr3:
                print '      out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3)
        no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v'], debug=self.debug)
        if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
            if self.debug:
                print '       unproductive rearrangement in waterer codons_ok: %s   in_frame_cdr3: %s   no_stop_codon: %s' % (codons_ok, in_frame_cdr3, no_stop_codon)
            if self.args.skip_unproductive:
                if self.debug:
                    print '            ...skipping'
                self.n_unproductive += 1
                self.info['skipped_unproductive_queries'].append(query_name)
                return

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
            if self.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)

        if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.debug:
                print '  doubly expanding k_d'
            if k_d_max-k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(0, k_v_min - self.args.default_v_fuzz)  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)
            print '         used',
            for region in utils.regions:
                print ' %s: %d/%d' % (region, n_used[region], n_matches[region]),
            print ''


        kvals = {}
        kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max}
        kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max}
        self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions)
    def summarize_query(self, query_name, query_seq, all_match_names,
                        all_query_bounds, all_germline_bounds, warnings,
                        first_match_query_bounds):
        if self.debug:
            print '%s' % query_name

        best, match_names, n_matches = {}, {}, {}
        n_used = {'v': 0, 'd': 0, 'j': 0}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region],
                                             reverse=True)
            match_names[region] = []
        codon_positions = {
            'v': -1,
            'd': -1,
            'j': -1
        }  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)
        for region in utils.regions:
            n_matches[region] = len(all_match_names[region])
            n_skipped = 0
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(
                    query_seq
                )  # NOTE I'm putting these up avove as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.germline_seqs[region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.germline_seqs[region][gene][
                    glbounds[0]:glbounds[1]]

                # TODO since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative

                # only use a specified set of genes
                if self.args.only_genes is not None and gene not in self.args.only_genes:
                    n_skipped += 1
                    continue

                # add match to the list
                n_used[region] += 1
                match_names[region].append(gene)

                self.print_match(region,
                                 gene,
                                 query_seq,
                                 score,
                                 glbounds,
                                 qrbounds,
                                 -1,
                                 warnings,
                                 skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(
                        query_seq[qrbounds[0]:qrbounds[1]]
                ):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %d not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                if region == 'v':
                    this_k_v = all_query_bounds[gene][
                        1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
                    # In other words, sw doesn't tell the hmm about it
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][
                        1] - first_match_query_bounds[
                            1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.germline_seqs[region][
                        gene][glbounds[0]:glbounds[1]]
                    best[region +
                         '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

            if self.debug and n_skipped > 0:
                print '%8s skipped %d %s genes' % ('', n_skipped, region)

        for region in utils.regions:
            if region not in best:
                print '      no', region, 'match found for', query_name  # NOTE if no d match found, we should really just assume entire d was eroded
                return

        # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
        try:
            self.shift_overlapping_boundaries(all_query_bounds,
                                              all_germline_bounds, query_name,
                                              query_seq, best)
        except AssertionError:
            print '%s: apportionment failed' % query_name
            return

        # check for unproductive rearrangements
        for region in utils.regions:
            codon_positions[region] = utils.get_conserved_codon_position(
                self.cyst_positions,
                self.tryp_positions,
                region,
                best[region],
                all_germline_bounds[best[region]],
                all_query_bounds[best[region]],
                assert_on_fail=False
            )  # position in the query sequence, that is
        codons_ok = utils.check_both_conserved_codons(query_seq,
                                                      codon_positions['v'],
                                                      codon_positions['j'],
                                                      debug=self.debug,
                                                      extra_str='      ',
                                                      assert_on_fail=False)
        cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
        in_frame_cdr3 = (cdr3_length % 3 == 0)
        if self.debug and not in_frame_cdr3:
            print '      out of frame cdr3: %d %% 3 = %d' % (cdr3_length,
                                                             cdr3_length % 3)
        no_stop_codon = utils.stop_codon_check(query_seq,
                                               codon_positions['v'],
                                               debug=self.debug)
        if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
            if self.debug:
                print '       unproductive rearrangement in waterer codons_ok: %s   in_frame_cdr3: %s   no_stop_codon: %s' % (
                    codons_ok, in_frame_cdr3, no_stop_codon)
            if self.args.skip_unproductive:
                if self.debug:
                    print '            ...skipping'
                self.n_unproductive += 1
                self.info['skipped_unproductive_queries'].append(query_name)
                return

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][
            1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
            if self.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)

        if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][
                -5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.debug:
                print '  doubly expanding k_d'
            if k_d_max - k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(
            0, k_v_min - self.args.default_v_fuzz
        )  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)
            print '         used',
            for region in utils.regions:
                print ' %s: %d/%d' % (region, n_used[region],
                                      n_matches[region]),
            print ''

        kvals = {}
        kvals['v'] = {'best': k_v, 'min': k_v_min, 'max': k_v_max}
        kvals['d'] = {'best': k_d, 'min': k_d_min, 'max': k_d_max}
        self.add_to_info(query_name,
                         query_seq,
                         kvals,
                         match_names,
                         best,
                         all_germline_bounds,
                         all_query_bounds,
                         codon_positions=codon_positions)
Example #6
0
    def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds, queries_to_rerun):
        best, match_names = {}, {}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region], reverse=True)
            match_names[region] = []
        if self.debug >= 2:
            print query_name
        for region in utils.regions:
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(query_seq)  # NOTE I'm putting these up above as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.glfo['seqs'][region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]]

                match_names[region].append(gene)

                if self.debug >= 2:
                    self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %d not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                # NOTE since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative. UPDATE not sure if this is still relevant, but I'll move it down here in case I feel like thinking about it later
                if region == 'v':
                    this_k_v = all_query_bounds[gene][1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
                                                          # In other words, sw doesn't tell the hmm about it
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][1] - first_match_query_bounds[1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.glfo['seqs'][region][gene][glbounds[0]:glbounds[1]]
                    best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

        for region in utils.regions:
            if region not in best:
                if self.debug:
                    print '      no', region, 'match found for', query_name  # NOTE if no d match found, we should really just assume entire d was eroded
                queries_to_rerun['no-match'].add(query_name)
                return

        # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
        region_pairs = ({'left':'v', 'right':'d'}, {'left':'d', 'right':'j'})
        for rpair in region_pairs:
            overlap_status = self.check_boundaries(rpair, all_query_bounds, all_germline_bounds, query_name, query_seq, best)
            if overlap_status == 'overlap':
                self.shift_overlapping_boundaries(rpair, all_query_bounds, all_germline_bounds, query_name, query_seq, best)
            elif overlap_status == 'nonsense':
                queries_to_rerun['nonsense-bounds'].add(query_name)
                return
            else:
                assert overlap_status == 'ok'

        # check for suspiciously bad annotations
        vd_insertion = query_seq[all_query_bounds[best['v']][1] : all_query_bounds[best['d']][0]]
        dj_insertion = query_seq[all_query_bounds[best['d']][1] : all_query_bounds[best['j']][0]]
        if self.nth_try < 2:
            if len(vd_insertion) > self.max_insertion_length or len(dj_insertion) > self.max_insertion_length:
                if self.debug:
                    print '      suspiciously long insertion in %s, rerunning' % query_name
                queries_to_rerun['weird-annot.'].add(query_name)
                return
        if len(vd_insertion) > self.absolute_max_insertion_length or len(dj_insertion) > self.absolute_max_insertion_length:
            if self.debug:
                print '      suspiciously long insertion in %s, rerunning' % query_name
            queries_to_rerun['weird-annot.'].add(query_name)
            return

        if self.debug:
            print query_name

        # set and check conserved codon positions
        tmp_gl_positions = {'v' : self.glfo['cyst-positions'], 'j' : self.glfo['tryp-positions']}  # hack hack hack
        codon_positions = {}
        for region in ['v', 'j']:
            pos = tmp_gl_positions[region][best[region]] - all_germline_bounds[best[region]][0] + all_query_bounds[best[region]][0]  # position within original germline gene, minus the position in that germline gene at which the match starts, plus the position in the query sequence at which the match starts
            if pos < 0 or pos >= len(query_seq):
                if self.debug:
                    print '      invalid %s codon position (%d in seq of length %d), rerunning' % (region, pos, len(query_seq))
                queries_to_rerun['invalid-codon'].add(query_name)
                return
            codon_positions[region] = pos

        # check for unproductive rearrangements
        codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], assert_on_fail=False)
        cdr3_length = codon_positions['j'] - codon_positions['v'] + 3

        if cdr3_length < 6:  # NOTE six is also hardcoded in utils
            if self.debug:
                print '      negative cdr3 length %d' % (cdr3_length)
            queries_to_rerun['invalid-codon'].add(query_name)
            return

        in_frame_cdr3 = (cdr3_length % 3 == 0)
        no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v'])
        if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
            if self.debug:
                print '       unproductive rearrangement:',
                if not codons_ok:
                    print '  bad codons',
                if not in_frame_cdr3:
                    print '  out of frame cdr3',
                if not no_stop_codon:
                    print '  stop codon'
                print ''

            if self.nth_try < 2 and (not codons_ok or not in_frame_cdr3):  # rerun with higher mismatch score (sometimes unproductiveness is the result of a really screwed up annotation rather than an actual unproductive sequence). Note that stop codons aren't really indicative of screwed up annotations, so they don't count.
                if self.debug:
                    print '            ...rerunning'
                queries_to_rerun['unproductive'].add(query_name)
                return
            elif self.args.skip_unproductive:
                if self.debug:
                    print '            ...skipping'
                self.unproductive_queries.add(query_name)
                self.remaining_queries.remove(query_name)
                return
            else:
                pass  # this is here so you don't forget that if neither of the above is true, we fall through and add the query to self.info

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
            if self.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)

        if 'IGHJ4*' in best['j'] and self.glfo['seqs']['d'][best['d']][-5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.debug:
                print '  doubly expanding k_d'
            if k_d_max-k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(1, k_v_min - self.args.default_v_fuzz)  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)


        kvals = {}
        kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max}
        kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max}
        self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions)