コード例 #1
0
    def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
        """
        Hamming distance between the inferred naive sequence and the true naive sequence.

        <true_line>, <line>: annotations for the true and the inferred rearrangement. Both are handed to
            utils.get_full_naive_seq() and both must have a 'seq' entry.
        <query_name>: only used to label the error message.
        <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
        NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
        <normalize> if set, return the distance as an integer percentage (rounded down) of the compared length.
        <padfo> if not None, a dict whose 'padleft' and 'padright' entries give the number of padding bases
            to strip from the two ends of the inferred sequence.
        <debug> print the sequences that are being compared.

        Returns an int; 0 (with a printed warning) if the compared sequences are empty.
        Raises Exception if the two sequences still differ in length after all the adjustments, and
        AssertionError if the inferred 'seq' is shorter than, but not contained in, the true 'seq'.
        """

        true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
        inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

        true_seq, inferred_seq = true_line['seq'], line['seq']
        if len(true_seq) > len(inferred_seq):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
            start = true_seq.find(inferred_seq)
            if start < 0:  # explicit raise (rather than a bare assert) so the check survives python -O
                raise AssertionError('inferred seq for %s is not contained in the true seq' % query_name)
            n_left = start  # bases missing from the left end
            n_right = len(true_seq) - (start + len(inferred_seq))  # bases missing from the right end
            inferred_naive_seq = 'N' * n_left + inferred_naive_seq + 'N' * n_right
            if debug:
                print('  adding to inferred naive seq')

        if padfo is not None:  # remove N padding from the inferred sequence
            inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
            if padfo['padright'] > 0:  # a [: -0] slice would throw away the whole sequence
                inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]

        bounds = None
        if restrict_to_region != '':
            bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
            true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
            inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

        if debug:
            print('%s %s %s' % (restrict_to_region, 'region, bounds', bounds))
            print('%s %s' % ('  true ', true_naive_seq))
            print('%s %s' % ('  infer', inferred_naive_seq))

        if len(true_naive_seq) != len(inferred_naive_seq):
            raise Exception('still not the same lengths for %s\n  %s\n  %s' % (query_name, true_naive_seq, inferred_naive_seq))
        if len(true_naive_seq) == 0:  # checked *before* computing anything, so an empty comparison can never reach a division
            print('WARNING zero length sequence in hamming_distance_to_true_naive')
            return 0

        fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
        # presumably fraction == n_mismatches / len_excluding_ambig -- TODO confirm against utils.hamming_fraction().
        # round() rather than plain int(): the float product can land just below the true count (0.29 * 100 == 28.999999999999996)
        total_distance = int(round(fraction * len_excluding_ambig))
        if normalize:
            # pure integer arithmetic, so the percentage is floored exactly instead of suffering from float error
            return (100 * total_distance) // len(true_naive_seq)
        return total_distance
コード例 #2
0
    def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, debug=False):
        """
        Hamming distance between the inferred naive sequence and the true naive sequence.

        <true_line>, <line>: annotations for the true and the inferred rearrangement. Both are handed to
            utils.get_full_naive_seq() and both must have a 'seq' entry.
        <query_name>: only used to label the error message.
        <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
        NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
        <normalize> if set, return the distance as an integer percentage (rounded down) of the compared length.
        <debug> print the sequences that are being compared.

        Returns the distance; 0 (with a printed warning) if the compared sequences are empty.
        If the two sequences still differ in length after all the adjustments, prints both and exits
        the process with a non-zero status. Raises AssertionError if the inferred 'seq' is shorter than,
        but not contained in, the true 'seq'.
        """

        true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
        inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

        true_seq, inferred_seq = true_line['seq'], line['seq']
        if len(true_seq) > len(inferred_seq):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
            start = true_seq.find(inferred_seq)
            if start < 0:  # explicit raise (rather than a bare assert) so the check survives python -O
                raise AssertionError('inferred seq for %s is not contained in the true seq' % query_name)
            n_left = start  # bases missing from the left end
            n_right = len(true_seq) - (start + len(inferred_seq))  # bases missing from the right end
            # pad with 'x' so that the restored positions line up with the true sequence
            inferred_naive_seq = 'x' * n_left + inferred_naive_seq + 'x' * n_right
            if debug:
                print('  adding to inferred naive seq')

        bounds = None
        if restrict_to_region != '':
            bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
            true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
            inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

        if debug:
            print('%s %s %s' % (restrict_to_region, 'region, bounds', bounds))
            print('%s %s' % ('  true ', true_naive_seq))
            print('%s %s' % ('  infer', inferred_naive_seq))

        if len(true_naive_seq) != len(inferred_naive_seq):
            print('ERROR still not the same lengths for %s' % query_name)
            print('%s %s' % ('  true ', true_naive_seq))
            print('%s %s' % ('  infer', inferred_naive_seq))
            sys.exit(1)  # this is a failure, so don't report success (status 0) to the calling shell
        if len(true_naive_seq) == 0:  # checked *before* computing anything, so an empty comparison can never reach a division
            print('WARNING zero length sequence in hamming_distance_to_true_naive')
            return 0

        total_distance = utils.hamming(true_naive_seq, inferred_naive_seq)  # presumably an integer mismatch count -- TODO confirm
        if normalize:
            # floor division instead of int(100 * float ratio): 100 * (29. / 100) is 28.999999999999996, which int() turns into 28
            return int((100 * total_distance) // len(true_naive_seq))
        return total_distance
コード例 #3
0
def resolve_overlapping_matches(line, debug=False, germlines=None):
    """
    joinsolver allows d and j matches (and v and d matches) to overlap... which makes no sense, so
    arbitrarily split the disputed territory in two.

    <line> is modified in place. For each of the (v, d) and (d, j) pairs whose query matches overlap,
    the left gene's '_gl_seq' and '_qr_seq' lose the lower half of the overlap from their right end
    (and its '_3p_del' grows accordingly), while the right gene's '_gl_seq' and '_qr_seq' lose the
    upper half from their left end (and its '_5p_del' grows accordingly).
    <germlines> is only passed through to utils.get_full_naive_seq().

    Returns None.
    Raises AssertionError if a query match can't be found in line['seq'], if a portion to remove is
    longer than the germline match it would be removed from, or if the resulting naive sequence is
    not the same length as line['seq'].
    """
    # NOTE this does pretty much the same thing as a function in waterer
    seq = line['seq']  # not modified below, only the per-region entries are
    for left, right in (('v', 'd'), ('d', 'j')):
        l_qr_seq = line[left + '_qr_seq']
        r_qr_seq = line[right + '_qr_seq']
        # escape the sequences, so they're counted as literal strings (just like str.find() below treats them) rather than as regex patterns
        n_l_matches = len(re.findall(re.escape(l_qr_seq), seq))
        n_r_matches = len(re.findall(re.escape(r_qr_seq), seq))
        if n_l_matches == 0 or n_r_matches == 0:  # without both matches there is no overlap to speak of
            if debug:
                print('    all_l_matches %d all_r_matches %d' % (n_l_matches, n_r_matches))
            raise AssertionError('%s and/or %s query match not found in sequence' % (left, right))

        lpos = seq.find(l_qr_seq)  # find the *first* occurences
        rpos = seq.find(r_qr_seq)  #   of l_qr_seq and r_qr_seq
        # (base after the last left-gene-matched base) - (first base of right-hand match)
        overlap = lpos + len(l_qr_seq) - rpos
        if n_l_matches > 1 or n_r_matches > 1:  # darn, the match occurs more than once, so we have to figure out which one to use. Choose the one that gives the smallest overlap
            # NOTE(review): this minimizes the *signed* overlap, which favors the leftmost left match paired with the
            # rightmost right match -- confirm that the smallest *absolute* overlap wasn't what was intended
            lpos = -1
            for _ in range(n_l_matches):  # loop over all the combinations to find the smallest difference
                lpos = seq.find(l_qr_seq, lpos + 1)
                rpos = -1  # restart the right-hand search for every left-hand occurence, otherwise we don't see all the combinations
                for _ in range(n_r_matches):
                    rpos = seq.find(r_qr_seq, rpos + 1)
                    overlap = min(overlap, lpos + len(l_qr_seq) - rpos)
            if debug:
                if n_l_matches > 1:
                    print('    WARNING %d occurences of %s in %s' % (n_l_matches, l_qr_seq, seq))
                if n_r_matches > 1:
                    print('    WARNING %d occurences of %s in %s' % (n_r_matches, r_qr_seq, seq))

        if overlap <= 0:
            if debug:
                print('    no %s overlap, not doing anything' % (left + right))
            continue

        lefthand_portion = overlap // 2  # rounded down
        righthand_portion = overlap - lefthand_portion  # rounded up, i.e. the right-hand match gives up the odd base
        if debug:
            print('     WARNING %s apportioning %d overlapping bases between %s (%d) and %s (%d) matches' % (
                line['unique_id'], overlap, left, lefthand_portion, right, righthand_portion))
        # explicit raises (rather than bare asserts) so the checks survive python -O
        if lefthand_portion > len(line[left + '_gl_seq']):
            raise AssertionError('%s portion of the overlap is longer than its germline match' % left)
        if righthand_portion > len(line[right + '_gl_seq']):
            raise AssertionError('%s portion of the overlap is longer than its germline match' % right)

        if lefthand_portion > 0:  # slicing doesn't 'work' with zeros ([:-0] is empty)
            line[left + '_gl_seq'] = line[left + '_gl_seq'][:-lefthand_portion]
            line[left + '_qr_seq'] = l_qr_seq[:-lefthand_portion]
            line[left + '_3p_del'] += lefthand_portion

        line[right + '_gl_seq'] = line[right + '_gl_seq'][righthand_portion:]
        line[right + '_qr_seq'] = r_qr_seq[righthand_portion:]
        line[right + '_5p_del'] += righthand_portion

    naive_seq = utils.get_full_naive_seq(germlines, line)
    muted_seq = line['seq']

    if len(naive_seq) != len(muted_seq):  # this will happen if, for instance, there's a really short D match and there's no way to figure out where it was really supposed to be
        if debug:
            print('    unequal lengths:')
            print('%s %s' % ('        ', naive_seq))
            print('%s %s' % ('        ', muted_seq))
        raise AssertionError('naive and mutated sequences have unequal lengths')
コード例 #4
0
    def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
        """
        Hamming distance between the inferred naive sequence and the true naive sequence.

        <true_line>, <line>: annotations for the true and the inferred rearrangement. Both are handed to
            utils.get_full_naive_seq(); <line> must have a 'unique_id' entry for the error message.
        <query_name>: only used to label the second error message.
        <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
        NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
        <normalize> if set, return the distance as an integer percentage (rounded down) of the compared length.
        <padfo> if not None, a dict whose 'padleft' and 'padright' entries give the number of N padding bases
            that we try to strip from the inferred sequence.
        <debug> print what's being removed and compared.

        Returns an int; 0 (with a printed warning) if the compared sequences are empty.
        Raises Exception if the true and inferred naive sequences differ in length, either straight away
        (after printing both annotations side by side) or after the padding/region adjustments.
        """

        true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
        inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)
        if len(true_naive_seq) != len(inferred_naive_seq):
            # print the two annotations key by key, so it's possible to see where they disagree
            print('%20s    true      inf' % '')
            for k in true_line:
                inferred_str = ('   %s' % line[k]) if k in line else '    NOPE'
                print('%s %s' % ('%20s   %s' % (k, true_line[k]), inferred_str))
            for k in line:
                if k not in true_line:
                    print('  not in true line   %20s    %s' % (k, line[k]))
            raise Exception('%s true and inferred sequences not the same length\n   %s\n   %s\n' % (line['unique_id'], true_naive_seq, inferred_naive_seq))

        # NOTE the old work-around that re-added eroded ends to the inferred sequence as N padding is switched off here

        if padfo is not None:  # remove N padding from the inferred sequence
            if debug:
                print('removing padfo')
                print(inferred_naive_seq)
            # NOTE(review): this counts the Ns in the part that is *kept* ([padleft:]), not in the part that is removed ([:padleft]) -- confirm which was intended before touching it
            if inferred_naive_seq[padfo['padleft'] : ].count('N') == padfo['padleft']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
            elif debug:  # NOTE if no debug, we just fall through, which is ok
                print('tried to remove non Ns!\n   %s\n   padleft %d\n' % (inferred_naive_seq, padfo['padleft']))
            if padfo['padright'] > 0:
                # NOTE(review): similarly, this counts the Ns at the *start* of the sequence ([:padright]), whereas it's the end that gets removed -- confirm
                if inferred_naive_seq[ : padfo['padright']].count('N') == padfo['padright']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                    inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]
                elif debug:  # NOTE if no debug, we just fall through, which is ok
                    print('tried to remove non Ns!\n   %s\n   padright %d\n' % (inferred_naive_seq, padfo['padright']))
            if debug:
                print(padfo['padleft'] * ' ' + inferred_naive_seq + padfo['padleft'] * ' ')

        bounds = None
        if restrict_to_region != '':
            bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
            if debug:
                print('restrict to %s' % restrict_to_region)
                utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True, extra_str='      ')
                utils.color_mutants(true_naive_seq[bounds[0] : bounds[1]], inferred_naive_seq[bounds[0] : bounds[1]], print_result=True, extra_str='      ' + bounds[0]*' ')
            true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
            inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

        if len(true_naive_seq) != len(inferred_naive_seq):
            raise Exception('still not the same lengths for %s\n  %s\n  %s' % (query_name, true_naive_seq, inferred_naive_seq))
        if len(true_naive_seq) == 0:  # checked *before* computing anything, so an empty comparison can never reach a division
            print('WARNING zero length sequence in hamming_distance_to_true_naive')
            return 0

        fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
        # presumably fraction == n_mismatches / len_excluding_ambig -- TODO confirm against utils.hamming_fraction().
        # round() rather than plain int(): the float product can land just below the true count (0.29 * 100 == 28.999999999999996)
        total_distance = int(round(fraction * len_excluding_ambig))
        if normalize:
            # pure integer arithmetic, so the percentage is floored exactly instead of suffering from float error
            return (100 * total_distance) // len(true_naive_seq)
        return total_distance
コード例 #5
0
ファイル: joinparser.py プロジェクト: Irrationone/partis
def resolve_overlapping_matches(line, debug=False, germlines=None):
    """
    joinsolver allows d and j matches (and v and d matches) to overlap... which makes no sense, so
    arbitrarily split the disputed territory in two.

    <line> is modified in place. For each of the (v, d) and (d, j) pairs whose query matches overlap,
    the left gene's '_gl_seq' and '_qr_seq' lose the lower half of the overlap from their right end
    (and its '_3p_del' grows accordingly), while the right gene's '_gl_seq' and '_qr_seq' lose the
    upper half from their left end (and its '_5p_del' grows accordingly).
    <germlines> is only passed through to utils.get_full_naive_seq().

    Returns None.
    Raises AssertionError if a query match can't be found in line['seq'], if a portion to remove is
    longer than the germline match it would be removed from, or if the resulting naive sequence is
    not the same length as line['seq'].
    """
    # NOTE this does pretty much the same thing as a function in waterer
    seq = line['seq']  # not modified below, only the per-region entries are
    for left, right in (('v', 'd'), ('d', 'j')):
        l_qr_seq = line[left + '_qr_seq']
        r_qr_seq = line[right + '_qr_seq']
        # escape the sequences, so they're counted as literal strings (just like str.find() below treats them) rather than as regex patterns
        n_l_matches = len(re.findall(re.escape(l_qr_seq), seq))
        n_r_matches = len(re.findall(re.escape(r_qr_seq), seq))
        if n_l_matches == 0 or n_r_matches == 0:  # without both matches there is no overlap to speak of
            if debug:
                print('    all_l_matches %d all_r_matches %d' % (n_l_matches, n_r_matches))
            raise AssertionError('%s and/or %s query match not found in sequence' % (left, right))

        lpos = seq.find(l_qr_seq)  # find the *first* occurences
        rpos = seq.find(r_qr_seq)  #   of l_qr_seq and r_qr_seq
        # (base after the last left-gene-matched base) - (first base of right-hand match)
        overlap = lpos + len(l_qr_seq) - rpos
        if n_l_matches > 1 or n_r_matches > 1:  # darn, the match occurs more than once, so we have to figure out which one to use. Choose the one that gives the smallest overlap
            # NOTE(review): this minimizes the *signed* overlap, which favors the leftmost left match paired with the
            # rightmost right match -- confirm that the smallest *absolute* overlap wasn't what was intended
            lpos = -1
            for _ in range(n_l_matches):  # loop over all the combinations to find the smallest difference
                lpos = seq.find(l_qr_seq, lpos + 1)
                rpos = -1  # restart the right-hand search for every left-hand occurence, otherwise we don't see all the combinations
                for _ in range(n_r_matches):
                    rpos = seq.find(r_qr_seq, rpos + 1)
                    overlap = min(overlap, lpos + len(l_qr_seq) - rpos)
            if debug:
                if n_l_matches > 1:
                    print('    WARNING %d occurences of %s in %s' % (n_l_matches, l_qr_seq, seq))
                if n_r_matches > 1:
                    print('    WARNING %d occurences of %s in %s' % (n_r_matches, r_qr_seq, seq))

        if overlap <= 0:
            if debug:
                print('    no %s overlap, not doing anything' % (left + right))
            continue

        lefthand_portion = overlap // 2  # rounded down
        righthand_portion = overlap - lefthand_portion  # rounded up, i.e. the right-hand match gives up the odd base
        if debug:
            print('     WARNING %s apportioning %d overlapping bases between %s (%d) and %s (%d) matches' % (line['unique_id'], overlap, left, lefthand_portion, right, righthand_portion))
        # explicit raises (rather than bare asserts) so the checks survive python -O
        if lefthand_portion > len(line[left + '_gl_seq']):
            raise AssertionError('%s portion of the overlap is longer than its germline match' % left)
        if righthand_portion > len(line[right + '_gl_seq']):
            raise AssertionError('%s portion of the overlap is longer than its germline match' % right)

        if lefthand_portion > 0:  # slicing doesn't 'work' with zeros ([:-0] is empty)
            line[left + '_gl_seq'] = line[left + '_gl_seq'][:-lefthand_portion]
            line[left + '_qr_seq'] = l_qr_seq[:-lefthand_portion]
            line[left + '_3p_del'] += lefthand_portion

        line[right + '_gl_seq'] = line[right + '_gl_seq'][righthand_portion:]
        line[right + '_qr_seq'] = r_qr_seq[righthand_portion:]
        line[right + '_5p_del'] += righthand_portion

    naive_seq = utils.get_full_naive_seq(germlines, line)
    muted_seq = line['seq']

    if len(naive_seq) != len(muted_seq):  # this will happen if, for instance, there's a really short D match and there's no way to figure out where it was really supposed to be
        if debug:
            print('    unequal lengths:')
            print('%s %s' % ('        ', naive_seq))
            print('%s %s' % ('        ', muted_seq))
        raise AssertionError('naive and mutated sequences have unequal lengths')