Beispiel #1
0
    def summarize_query(self, query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds):
        if self.debug:
            print '%s' % query_name

        best, match_names, n_matches = {}, {}, {}
        n_used = {'v':0, 'd':0, 'j':0}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region], reverse=True)
            match_names[region] = []
        codon_positions = {'v':-1, 'd':-1, 'j':-1}  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)
        for region in utils.regions:
            n_matches[region] = len(all_match_names[region])
            n_skipped = 0
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(query_seq)  # NOTE I'm putting these up avove as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.germline_seqs[region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]

                # TODO since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative

                # only use a specified set of genes
                if self.args.only_genes is not None and gene not in self.args.only_genes:
                    n_skipped += 1
                    continue

                # add match to the list
                n_used[region] += 1
                match_names[region].append(gene)

                self.print_match(region, gene, query_seq, score, glbounds, qrbounds, -1, warnings, skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(query_seq[qrbounds[0]:qrbounds[1]]):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %d not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                if region == 'v':
                    this_k_v = all_query_bounds[gene][1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
                                                          # In other words, sw doesn't tell the hmm about it
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][1] - first_match_query_bounds[1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.germline_seqs[region][gene][glbounds[0]:glbounds[1]]
                    best[region + '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

            if self.debug and n_skipped > 0:
                print '%8s skipped %d %s genes' % ('', n_skipped, region)

        for region in utils.regions:
            if region not in best:
                print '      no', region, 'match found for', query_name  # NOTE if no d match found, we should really just assume entire d was eroded
                return

        # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
        try:
            self.shift_overlapping_boundaries(all_query_bounds, all_germline_bounds, query_name, query_seq, best)
        except AssertionError:
            print '%s: apportionment failed' % query_name
            return

        # check for unproductive rearrangements
        for region in utils.regions:
            codon_positions[region] = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, region, best[region], all_germline_bounds[best[region]], all_query_bounds[best[region]], assert_on_fail=False)  # position in the query sequence, that is
        codons_ok = utils.check_both_conserved_codons(query_seq, codon_positions['v'], codon_positions['j'], debug=self.debug, extra_str='      ', assert_on_fail=False)
        cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
        in_frame_cdr3 = (cdr3_length % 3 == 0)
        if self.debug and not in_frame_cdr3:
                print '      out of frame cdr3: %d %% 3 = %d' % (cdr3_length, cdr3_length % 3)
        no_stop_codon = utils.stop_codon_check(query_seq, codon_positions['v'], debug=self.debug)
        if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
            if self.debug:
                print '       unproductive rearrangement in waterer codons_ok: %s   in_frame_cdr3: %s   no_stop_codon: %s' % (codons_ok, in_frame_cdr3, no_stop_codon)
            if self.args.skip_unproductive:
                if self.debug:
                    print '            ...skipping'
                self.n_unproductive += 1
                self.info['skipped_unproductive_queries'].append(query_name)
                return

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
            if self.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)

        if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][-5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.debug:
                print '  doubly expanding k_d'
            if k_d_max-k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(0, k_v_min - self.args.default_v_fuzz)  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)
            print '         used',
            for region in utils.regions:
                print ' %s: %d/%d' % (region, n_used[region], n_matches[region]),
            print ''


        kvals = {}
        kvals['v'] = {'best':k_v, 'min':k_v_min, 'max':k_v_max}
        kvals['d'] = {'best':k_d, 'min':k_d_min, 'max':k_d_max}
        self.add_to_info(query_name, query_seq, kvals, match_names, best, all_germline_bounds, all_query_bounds, codon_positions=codon_positions)
    def summarize_query(self, query_name, query_seq, all_match_names,
                        all_query_bounds, all_germline_bounds, warnings,
                        first_match_query_bounds):
        if self.debug:
            print '%s' % query_name

        best, match_names, n_matches = {}, {}, {}
        n_used = {'v': 0, 'd': 0, 'j': 0}
        k_v_min, k_d_min = 999, 999
        k_v_max, k_d_max = 0, 0
        for region in utils.regions:
            all_match_names[region] = sorted(all_match_names[region],
                                             reverse=True)
            match_names[region] = []
        codon_positions = {
            'v': -1,
            'd': -1,
            'j': -1
        }  # conserved codon positions (v:cysteine, d:dummy, j:tryptophan)
        for region in utils.regions:
            n_matches[region] = len(all_match_names[region])
            n_skipped = 0
            for score, gene in all_match_names[region]:
                glbounds = all_germline_bounds[gene]
                qrbounds = all_query_bounds[gene]
                assert qrbounds[1] <= len(
                    query_seq
                )  # NOTE I'm putting these up avove as well (in process_query), so in time I should remove them from here
                assert glbounds[1] <= len(self.germline_seqs[region][gene])
                assert qrbounds[0] >= 0
                assert glbounds[0] >= 0
                glmatchseq = self.germline_seqs[region][gene][
                    glbounds[0]:glbounds[1]]

                # TODO since I'm no longer skipping the genes after the first <args.n_max_per_region>, the OR of k-space below is overly conservative

                # only use a specified set of genes
                if self.args.only_genes is not None and gene not in self.args.only_genes:
                    n_skipped += 1
                    continue

                # add match to the list
                n_used[region] += 1
                match_names[region].append(gene)

                self.print_match(region,
                                 gene,
                                 query_seq,
                                 score,
                                 glbounds,
                                 qrbounds,
                                 -1,
                                 warnings,
                                 skipping=False)

                # if the germline match and the query match aren't the same length, s-w likely added an insert, which we shouldn't get since the gap-open penalty is jacked up so high
                if len(glmatchseq) != len(
                        query_seq[qrbounds[0]:qrbounds[1]]
                ):  # neurotic double check (um, I think) EDIT hey this totally saved my ass
                    print 'ERROR %d not same length' % query_name
                    print glmatchseq, glbounds[0], glbounds[1]
                    print query_seq[qrbounds[0]:qrbounds[1]]
                    assert False

                if region == 'v':
                    this_k_v = all_query_bounds[gene][
                        1]  # NOTE even if the v match doesn't start at the left hand edge of the query sequence, we still measure k_v from there.
                    # In other words, sw doesn't tell the hmm about it
                    k_v_min = min(this_k_v, k_v_min)
                    k_v_max = max(this_k_v, k_v_max)
                if region == 'd':
                    this_k_d = all_query_bounds[gene][
                        1] - first_match_query_bounds[
                            1]  # end of d minus end of v
                    k_d_min = min(this_k_d, k_d_min)
                    k_d_max = max(this_k_d, k_d_max)

                # check consistency with best match (since the best match is excised in s-w code, and because ham is run with *one* k_v k_d set)
                if region not in best:
                    best[region] = gene
                    best[region + '_gl_seq'] = self.germline_seqs[region][
                        gene][glbounds[0]:glbounds[1]]
                    best[region +
                         '_qr_seq'] = query_seq[qrbounds[0]:qrbounds[1]]
                    best[region + '_score'] = score

            if self.debug and n_skipped > 0:
                print '%8s skipped %d %s genes' % ('', n_skipped, region)

        for region in utils.regions:
            if region not in best:
                print '      no', region, 'match found for', query_name  # NOTE if no d match found, we should really just assume entire d was eroded
                return

        # s-w allows d and j matches to overlap, so we need to apportion the disputed bases
        try:
            self.shift_overlapping_boundaries(all_query_bounds,
                                              all_germline_bounds, query_name,
                                              query_seq, best)
        except AssertionError:
            print '%s: apportionment failed' % query_name
            return

        # check for unproductive rearrangements
        for region in utils.regions:
            codon_positions[region] = utils.get_conserved_codon_position(
                self.cyst_positions,
                self.tryp_positions,
                region,
                best[region],
                all_germline_bounds[best[region]],
                all_query_bounds[best[region]],
                assert_on_fail=False
            )  # position in the query sequence, that is
        codons_ok = utils.check_both_conserved_codons(query_seq,
                                                      codon_positions['v'],
                                                      codon_positions['j'],
                                                      debug=self.debug,
                                                      extra_str='      ',
                                                      assert_on_fail=False)
        cdr3_length = codon_positions['j'] - codon_positions['v'] + 3
        in_frame_cdr3 = (cdr3_length % 3 == 0)
        if self.debug and not in_frame_cdr3:
            print '      out of frame cdr3: %d %% 3 = %d' % (cdr3_length,
                                                             cdr3_length % 3)
        no_stop_codon = utils.stop_codon_check(query_seq,
                                               codon_positions['v'],
                                               debug=self.debug)
        if not codons_ok or not in_frame_cdr3 or not no_stop_codon:
            if self.debug:
                print '       unproductive rearrangement in waterer codons_ok: %s   in_frame_cdr3: %s   no_stop_codon: %s' % (
                    codons_ok, in_frame_cdr3, no_stop_codon)
            if self.args.skip_unproductive:
                if self.debug:
                    print '            ...skipping'
                self.n_unproductive += 1
                self.info['skipped_unproductive_queries'].append(query_name)
                return

        # best k_v, k_d:
        k_v = all_query_bounds[best['v']][1]  # end of v match
        k_d = all_query_bounds[best['d']][1] - all_query_bounds[best['v']][
            1]  # end of d minus end of v

        if k_d_max < 5:  # since the s-w step matches to the longest possible j and then excises it, this sometimes gobbles up the d, resulting in a very short d alignment.
            if self.debug:
                print '  expanding k_d'
            k_d_max = max(8, k_d_max)

        if 'IGHJ4*' in best['j'] and self.germline_seqs['d'][best['d']][
                -5:] == 'ACTAC':  # the end of some d versions is the same as the start of some j versions, so the s-w frequently kicks out the 'wrong' alignment
            if self.debug:
                print '  doubly expanding k_d'
            if k_d_max - k_d_min < 8:
                k_d_min -= 5
                k_d_max += 2

        k_v_min = max(
            0, k_v_min - self.args.default_v_fuzz
        )  # ok, so I don't *actually* want it to be zero... oh, well
        k_v_max += self.args.default_v_fuzz
        k_d_min = max(1, k_d_min - self.args.default_d_fuzz)
        k_d_max += self.args.default_d_fuzz
        assert k_v_min > 0 and k_d_min > 0 and k_v_max > 0 and k_d_max > 0

        if self.debug:
            print '         k_v: %d [%d-%d)' % (k_v, k_v_min, k_v_max)
            print '         k_d: %d [%d-%d)' % (k_d, k_d_min, k_d_max)
            print '         used',
            for region in utils.regions:
                print ' %s: %d/%d' % (region, n_used[region],
                                      n_matches[region]),
            print ''

        kvals = {}
        kvals['v'] = {'best': k_v, 'min': k_v_min, 'max': k_v_max}
        kvals['d'] = {'best': k_d, 'min': k_d_min, 'max': k_d_max}
        self.add_to_info(query_name,
                         query_seq,
                         kvals,
                         match_names,
                         best,
                         all_germline_bounds,
                         all_query_bounds,
                         codon_positions=codon_positions)
Beispiel #3
0
    def process_query(self, bam, reads):
        primary = next((r for r in reads if not r.is_secondary), None)
        query_seq = primary.seq
        query_name = primary.qname
        first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
        all_match_names = {}
        warnings = {}  # ick, this is a messy way to pass stuff around
        for region in utils.regions:
            all_match_names[region] = []
        all_query_bounds, all_germline_bounds = {}, {}
        n_skipped_invalid_cpos = 0
        for read in reads:  # loop over the matches found for each query sequence
            # set this match's values
            read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
            gene = bam.references[read.tid]
            region = utils.get_region(gene)
            raw_score = read.tags[0][1]  # raw because they don't include the gene choice probs
            score = raw_score
            if self.args.apply_choice_probs_in_sw:  # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
                score = self.get_choice_prob(region, gene) * raw_score  # multiply by the probability to choose this gene
            qrbounds = (read.qstart, read.qend)
            glbounds = (read.pos, read.aend)
            if region == 'v' and first_match_query_bounds is None:
                first_match_query_bounds = qrbounds

            # perform a few checks and see if we want to skip this match
            if region == 'v':  # skip matches with cpos past the end of the query seq (i.e. eroded a ton on the right side of the v)
                cpos = utils.get_conserved_codon_position(self.cyst_positions, self.tryp_positions, 'v', gene, glbounds, qrbounds, assert_on_fail=False)
                if not utils.check_conserved_cysteine(self.germline_seqs['v'][gene], self.cyst_positions[gene]['cysteine-position'], assert_on_fail=False):  # some of the damn cysteine positions in the json file were wrong, so now we check
                    raise Exception('bad cysteine in %s: %d %s' % (gene, self.cyst_positions[gene]['cysteine-position'], self.germline_seqs['v'][gene]))
                if cpos < 0 or cpos >= len(query_seq):
                    n_skipped_invalid_cpos += 1
                    continue

            if 'I' in read.cigarstring or 'D' in read.cigarstring:  # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease the <self.args.gap_open_penalty>)
                if len(all_match_names[region]) == 0:  # if this is the first (best) match for this region, allow indels (otherwise skip the match)
                    if query_name not in self.info['indels']:
                        self.info['indels'][query_name] = self.get_indel_info(query_name, read.cigarstring, query_seq[qrbounds[0] : qrbounds[1]], self.germline_seqs[region][gene][glbounds[0] : glbounds[1]], gene)
                        self.info['indels'][query_name]['reversed_seq'] = query_seq[ : qrbounds[0]] + self.info['indels'][query_name]['reversed_seq'] + query_seq[qrbounds[1] : ]
                        self.new_indels += 1
                        # print ' query seq  %s' % query_seq
                        # print 'indelfo seq %s' % self.info['indels'][query_name]['reversed_seq']
                        # self.info['skipped_indel_queries'].append(query_name)
                        # self.info[query_name] = {'indels'}
                    else:
                        print '     multiple indels for %s' % query_name
                    return
                else:
                    continue

            if qrbounds[1]-qrbounds[0] != glbounds[1]-glbounds[0]:
                raise Exception('germline match (%d %d) not same length as query match (%d %d)' % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1]))

            assert qrbounds[1] <= len(query_seq)
            if glbounds[1] > len(self.germline_seqs[region][gene]):
                print '  ', gene
                print '  ', glbounds[1], len(self.germline_seqs[region][gene])
                print '  ', self.germline_seqs[region][gene]
            assert glbounds[1] <= len(self.germline_seqs[region][gene])
            assert qrbounds[1]-qrbounds[0] == glbounds[1]-glbounds[0]

            # and finally add this match's information
            warnings[gene] = ''
            all_match_names[region].append((score, gene))  # NOTE it is important that this is ordered such that the best match is first
            all_query_bounds[gene] = qrbounds
            all_germline_bounds[gene] = glbounds

        # if n_skipped_invalid_cpos > 0:
        #     print '      skipped %d invalid cpos values for %s' % (n_skipped_invalid_cpos, query_name)
        self.summarize_query(query_name, query_seq, all_match_names, all_query_bounds, all_germline_bounds, warnings, first_match_query_bounds)
    def process_query(self, bam, reads):
        primary = next((r for r in reads if not r.is_secondary), None)
        query_seq = primary.seq
        query_name = primary.qname
        first_match_query_bounds = None  # since sw excises its favorite v match, we have to know this match's boundaries in order to calculate k_d for all the other matches
        all_match_names = {}
        warnings = {}  # ick, this is a messy way to pass stuff around
        for region in utils.regions:
            all_match_names[region] = []
        all_query_bounds, all_germline_bounds = {}, {}
        n_skipped_invalid_cpos = 0
        for read in reads:  # loop over the matches found for each query sequence
            # set this match's values
            read.seq = query_seq  # only the first one has read.seq set by default, so we need to set the rest by hand
            gene = bam.references[read.tid]
            region = utils.get_region(gene)
            raw_score = read.tags[0][
                1]  # raw because they don't include the gene choice probs
            score = raw_score
            if self.args.apply_choice_probs_in_sw:  # NOTE I stopped applying the gene choice probs here because the smith-waterman scores don't correspond to log-probs, so throwing on the gene choice probs was dubious (and didn't seem to work that well)
                score = self.get_choice_prob(
                    region, gene
                ) * raw_score  # multiply by the probability to choose this gene
            qrbounds = (read.qstart, read.qend)
            glbounds = (read.pos, read.aend)
            if region == 'v' and first_match_query_bounds is None:
                first_match_query_bounds = qrbounds

            # perform a few checks and see if we want to skip this match
            if region == 'v':  # skip matches with cpos past the end of the query seq (i.e. eroded a ton on the right side of the v)
                cpos = utils.get_conserved_codon_position(self.cyst_positions,
                                                          self.tryp_positions,
                                                          'v',
                                                          gene,
                                                          glbounds,
                                                          qrbounds,
                                                          assert_on_fail=False)
                if not utils.check_conserved_cysteine(
                        self.germline_seqs['v'][gene],
                        self.cyst_positions[gene]['cysteine-position'],
                        assert_on_fail=False
                ):  # some of the damn cysteine positions in the json file were wrong, so now we check
                    raise Exception(
                        'bad cysteine in %s: %d %s' %
                        (gene, self.cyst_positions[gene]['cysteine-position'],
                         self.germline_seqs['v'][gene]))
                if cpos < 0 or cpos >= len(query_seq):
                    n_skipped_invalid_cpos += 1
                    continue

            if 'I' in read.cigarstring or 'D' in read.cigarstring:  # skip indels, and tell the HMM to skip indels (you won't see any unless you decrease the <self.args.gap_open_penalty>)
                if len(
                        all_match_names[region]
                ) == 0:  # if this is the first (best) match for this region, allow indels (otherwise skip the match)
                    if query_name not in self.info['indels']:
                        self.info['indels'][query_name] = self.get_indel_info(
                            query_name, read.cigarstring,
                            query_seq[qrbounds[0]:qrbounds[1]],
                            self.germline_seqs[region][gene]
                            [glbounds[0]:glbounds[1]], gene)
                        self.info['indels'][query_name][
                            'reversed_seq'] = query_seq[:qrbounds[
                                0]] + self.info['indels'][query_name][
                                    'reversed_seq'] + query_seq[qrbounds[1]:]
                        self.new_indels += 1
                        # print ' query seq  %s' % query_seq
                        # print 'indelfo seq %s' % self.info['indels'][query_name]['reversed_seq']
                        # self.info['skipped_indel_queries'].append(query_name)
                        # self.info[query_name] = {'indels'}
                    else:
                        print '     multiple indels for %s' % query_name
                    return
                else:
                    continue

            if qrbounds[1] - qrbounds[0] != glbounds[1] - glbounds[0]:
                raise Exception(
                    'germline match (%d %d) not same length as query match (%d %d)'
                    % (qrbounds[0], qrbounds[1], glbounds[0], glbounds[1]))

            assert qrbounds[1] <= len(query_seq)
            if glbounds[1] > len(self.germline_seqs[region][gene]):
                print '  ', gene
                print '  ', glbounds[1], len(self.germline_seqs[region][gene])
                print '  ', self.germline_seqs[region][gene]
            assert glbounds[1] <= len(self.germline_seqs[region][gene])
            assert qrbounds[1] - qrbounds[0] == glbounds[1] - glbounds[0]

            # and finally add this match's information
            warnings[gene] = ''
            all_match_names[region].append(
                (score, gene)
            )  # NOTE it is important that this is ordered such that the best match is first
            all_query_bounds[gene] = qrbounds
            all_germline_bounds[gene] = glbounds

        # if n_skipped_invalid_cpos > 0:
        #     print '      skipped %d invalid cpos values for %s' % (n_skipped_invalid_cpos, query_name)
        self.summarize_query(query_name, query_seq, all_match_names,
                             all_query_bounds, all_germline_bounds, warnings,
                             first_match_query_bounds)