コード例 #1
0
def add_single_indel(indelfo, pos, length, gapped_codon_positions, keep_in_frame=False, debug=False):
    """
    Add one indel of <length> bases at <pos> to the gapped sequences in <indelfo>, choosing between insertion and deletion with equal probability.

    Modifies <indelfo> in place ('qr_gap_seq', 'gl_gap_seq', 'indels'), and for insertions also shifts <gapped_codon_positions> and the positions of existing indels to the right of <pos>.
    Always returns None. If utils.gap_len() reports gaps in the new indel's sequence, a message is printed and nothing is modified.
    NOTE <keep_in_frame> is accepted but not used in this function.
    """
    is_insertion = numpy.random.uniform(0, 1) < 0.5  # fifty-fifty chance of insertion and deletion
    if is_insertion:
        indel_type = 'insertion'
        indel_str = ''.join(utils.nukes[random.randint(0, len(utils.nukes) - 1)] for _ in range(length))
    else:
        indel_type = 'deletion'
        # NOTE it's kind of unclear whether this should be the bit in the qr or gl seq. Using the gl probably makes more sense, since it
        # corresponds to what we would infer in s-w (i.e., if we _do_ delete some SHMd positions, we will never know about it, so who cares)
        indel_str = indelfo['gl_gap_seq'][pos : pos + length]

    # backup for the uncommon cases where overlaps() in the calling fcn doesn't catch something
    if utils.gap_len(indel_str) > 0:
        print('  failed adding indel (overlaps with previous one)')
        return

    qr_seq = indelfo['qr_gap_seq']
    if is_insertion:
        gl_seq = indelfo['gl_gap_seq']
        indelfo['qr_gap_seq'] = qr_seq[:pos] + indel_str + qr_seq[pos:]
        indelfo['gl_gap_seq'] = gl_seq[:pos] + length * utils.gap_chars[0] + gl_seq[pos:]
        # this isn't right if the indel is actually in the codon, but in that case we just let the messed up codon through below
        for region in gapped_codon_positions:
            if gapped_codon_positions[region] > pos:
                gapped_codon_positions[region] += length
        # correct the positions of any existing indels that're to the right of this one
        for previous in indelfo['indels']:
            if previous['pos'] > pos:
                previous['pos'] += length
    else:
        # the gapped seqs keep their length for a deletion (bases are replaced by gap chars), so no positions need shifting
        indelfo['qr_gap_seq'] = qr_seq[:pos] + length * utils.gap_chars[0] + qr_seq[pos + length :]

    cyst_ok = utils.codon_unmutated('cyst', indelfo['qr_gap_seq'], gapped_codon_positions['v'])
    if debug and not cyst_ok:
        print('  adding indel within %s codon' % 'cyst')

    indelfo['indels'].append({'type' : indel_type, 'pos' : pos, 'len' : length, 'seqstr' : indel_str})
    indelfo['indels'] = sorted(indelfo['indels'], key=lambda info: info['pos'])

    if debug:
        print(get_dbg_str(indelfo))
コード例 #2
0
def generate_snpd_gene(gene, cpos, seq, positions):
    """
    Apply one point mutation to <seq> for each entry in <positions>, and return info for the resulting new allele.

    Each entry of <positions> is either an index into <seq>, or None to choose a random index that has not been used yet and that leaves the cysteine codon at <cpos> intact.
    Returns a dict with keys 'template-gene' (the original <gene>), 'gene' (the new name from get_new_allele_name_and_change_mutfo()) and 'seq' (the mutated sequence).
    Asserts that <gene> is a v gene and that the cysteine codon is still unmutated at the end.
    """
    assert utils.get_region(gene) == 'v'  # others not yet handled

    def choose_position():
        """ keep drawing random positions until we get one that's unused and doesn't hit the cysteine codon """
        while True:
            candidate = random.randint(0, len(seq) - 1)  # note that randint() is inclusive
            if candidate in snpd_positions:
                continue
            masked_seq = seq[: candidate] + 'X' + seq[candidate + 1 :]  # for checking cyst position
            if utils.codon_unmutated('cyst', masked_seq, cpos, debug=True):
                return candidate

    snpd_positions = set()  # only consulted when a position wasn't specified (i.e. was None), but every position gets added
    mutfo = OrderedDict()
    for snp_pos in positions:
        if snp_pos is None:
            snp_pos = choose_position()
        snpd_positions.add(snp_pos)

        # draw bases until we get one that differs from what's there now
        while True:
            new_base = utils.nukes[random.randint(0, len(utils.nukes) - 1)]
            if new_base != seq[snp_pos]:
                break

        old_base = seq[snp_pos]
        print('        %3d   %s --> %s' % (snp_pos, old_base, new_base))
        mutfo[snp_pos] = {'original' : old_base, 'new' : new_base}
        seq = seq[: snp_pos] + new_base + seq[snp_pos + 1 :]

    assert utils.codon_unmutated('cyst', seq, cpos, debug=True)  # this is probably unnecessary
    snpd_name, mutfo = get_new_allele_name_and_change_mutfo(gene, mutfo)
    return {'template-gene' : gene, 'gene' : snpd_name, 'seq' : seq}
コード例 #3
0
def remove_v_genes_with_bad_cysteines(glfo, debug=False):
    """
    Remove from <glfo> every v gene whose cysteine codon is mutated (according to utils.codon_unmutated()) or which is out of frame (according to utils.in_frame_germline_v()).

    Modifies <glfo> in place via remove_gene(), and always prints a summary line with the number removed out of the number of v genes we started with.
    <debug> is only passed through to remove_gene().
    """
    prelength = len(glfo['seqs']['v'])
    for gene in list(glfo['seqs']['v']):  # iterate over a copy of the keys, since we modify the dict in the loop
        seq = glfo['seqs']['v'][gene]
        cpos = glfo['cyst-positions'][gene]
        mutated = not utils.codon_unmutated('cyst', seq, cpos)
        in_frame = utils.in_frame_germline_v(seq, cpos)
        if mutated or not in_frame:
            remove_gene(glfo, gene, debug=debug)

    # NOTE printed unconditionally (i.e. not only if <debug> is set)
    n_removed = prelength - len(glfo['seqs']['v'])
    print('  removed %d / %d v genes with bad cysteines' % (n_removed, prelength))  # denominator is the initial count, not what's left
コード例 #4
0
def add_single_indel(seq, indelfo, mean_length, codon_positions, indel_location=None, pos=None, keep_in_frame=False, debug=False):  # NOTE modifies <indelfo> and <codon_positions>
    """
    Add one insertion or deletion (equal odds) to <seq> and return the new sequence.

    If <pos> is None, the position is drawn with random.randint(): from [5, len(seq) - 6] if <indel_location> is None, from [5, codon_positions['v']] if it's 'v',
    or from [codon_positions['v'], codon_positions['j']] if it's 'cdr3' (anything else trips an assertion).
    The length is drawn from a geometric distribution with parameter 1 / <mean_length>; if <keep_in_frame> is set we redraw until it's a multiple of three,
    raising Exception once 100 redraws have been made.
    Info for the new indel is appended to <indelfo>['indels'] (by add_insertion() for insertions), and any entry in <codon_positions> to the right of <pos> is shifted.
    """
    if pos is None:  # otherwise we just use the <pos> that was passed in
        if indel_location is None:  # uniform over entire sequence
            pos = random.randint(5, len(seq) - 6)  # this will actually exclude either before the first index or after the last index. No, I don't care.
        elif indel_location == 'v':  # within the meat of the v
            pos = random.randint(5, codon_positions['v'])
        elif indel_location == 'cdr3':  # inside cdr3
            pos = random.randint(codon_positions['v'], codon_positions['j'])
        else:
            assert False

    geometric_p = 1. / mean_length
    length = numpy.random.geometric(geometric_p)
    if keep_in_frame:
        n_redraws = 0
        while length % 3 != 0:
            length = numpy.random.geometric(geometric_p)
            n_redraws += 1
            if n_redraws > 99:
                raise Exception('tried too many times to get in-frame indel length')

    do_insertion = numpy.random.uniform(0, 1) < 0.5  # fifty-fifty chance of insertion and deletion
    if do_insertion:
        new_seq = add_insertion(indelfo, seq, pos, length, debug=debug)
    else:
        removed_bases = seq[pos : pos + length]
        new_seq = seq[:pos] + seq[pos + length :]  # delete <length> bases beginning with <pos>
        indelfo['indels'].append({'type' : 'deletion', 'pos' : pos, 'len' : length, 'seqstr' : removed_bases})
        if debug:
            print('          deleting %d bases at %d' % (length, pos))

    # this isn't right if the indel is actually in the codon, but in that case we just let the messed up codon through below
    for region in codon_positions:
        if codon_positions[region] > pos:
            shift = sign(indelfo['indels'][-1]) * length
            codon_positions[region] += shift

    if not utils.codon_unmutated('cyst', new_seq, codon_positions['v']):
        print('  adding indel within %s codon' % 'cyst')

    return new_seq
コード例 #5
0
 def revert_conserved_codons(self, seq, debug=False):
     """
     Return <seq> with each conserved codon put back to its unmutated bases (eg if they were messed up by s.h.m.).

     For every (region, position) in self.post_erosion_codon_positions, the three bases at that position are replaced by self.unmutated_codons[region] if they differ.
     Asserts afterwards that utils.codon_unmutated() is happy with each codon.
     """
     # NOTE this happens *before* shm indels, i.e. we use self.post_erosion_codon_positions rather than self.final_codon_positions
     for region, pos in self.post_erosion_codon_positions.items():
         original_codon = self.unmutated_codons[region]
         current_codon = seq[pos : pos + 3]
         if current_codon != original_codon:
             assert len(original_codon) == 3
             if debug:
                 print('    reverting %s --> %s' % (current_codon, original_codon))
             seq = seq[:pos] + original_codon + seq[pos + 3 :]
         assert utils.codon_unmutated(utils.conserved_codons[self.glfo['locus']][region], seq, pos)
     return seq
コード例 #6
0
 def revert_conserved_codons(self, seq, debug=False):
     """
     Put the conserved codons in <seq> back to their original (unmutated) bases, and return the resulting sequence.

     Loops over the (region, position) pairs in self.post_erosion_codon_positions, swapping in self.unmutated_codons[region] wherever the three bases at that position differ from it,
     and asserting with utils.codon_unmutated() that each codon ends up ok.
     """
     # NOTE this happens *before* shm indels, i.e. we use self.post_erosion_codon_positions rather than self.final_codon_positions
     for region, pos in self.post_erosion_codon_positions.items():
         codon_end = pos + 3
         target = self.unmutated_codons[region]
         if seq[pos:codon_end] != target:
             assert len(target) == 3
             if debug:
                 # this doesn't happen *much* any more, but bppseqgen barfs if we pass it rates that are exactly zero, so it still happens sometimes
                 print('    reverting %s --> %s' % (seq[pos:codon_end], target))
             seq = seq[:pos] + target + seq[codon_end:]
         assert utils.codon_unmutated(utils.conserved_codons[self.glfo['locus']][region], seq, pos)
     return seq
コード例 #7
0
def trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=False):
    """
    Compare <gene> (with sequence <seq>) to its nearest gene in <template_glfo>, and trim or remove it in <glfo> so that it doesn't start before the template.

    After aligning <seq> to the nearest template sequence (chosen by glutils.find_nearest_gene_using_names()):
      - if neither aligned sequence starts with a gap, nothing is changed
      - if the aligned <seq> starts with a gap, <gene> is removed from <glfo>
      - otherwise the aligned template starts with gaps, so that many bases are trimmed from the start of <seq>: if the trimmed sequence is already in <glfo> the gene
        is removed, otherwise its sequence and cysteine position in <glfo> are updated (and we assert that the cysteine codon is still ok)
    Returns None.
    """
    nearest_template_gene = glutils.find_nearest_gene_using_names(template_glfo, gene)
    nearest_template_seq = template_glfo['seqs'][region][nearest_template_gene]
    # NOTE we can't get the number of extra bases from the difference in cyst positions, since that's not right if there's some internal gaps in the alignment
    aligned_nearest_template_seq, aligned_seq = utils.align_seqs(nearest_template_seq, seq)

    if debug:
        print('    %s' % utils.color_gene(gene))
        utils.color_mutants(aligned_nearest_template_seq, aligned_seq, print_result=True, ref_label='template ', extra_str='       ')

    if aligned_seq[0] not in utils.gap_chars and aligned_nearest_template_seq[0] not in utils.gap_chars:
        if debug:
            print('      ok')
        return

    if aligned_seq[0] in utils.gap_chars:
        if debug:
            print('      %s, removing' % utils.color('red', 'too small'))
        glutils.remove_gene(glfo, gene)
        return

    if debug:
        print('        extra bases %s' % utils.color_gene(gene))
    # strip *any* of the gap characters (not just '-'), so that this agrees with the checks above: otherwise a template starting with a different
    # gap character would give zero extra bases, and the untrimmed seq would then match its own entry in glfo and get removed as a duplicate
    all_gap_chars = ''.join(utils.gap_chars)
    extra_bases = len(aligned_nearest_template_seq) - len(aligned_nearest_template_seq.lstrip(all_gap_chars))
    seq = seq[extra_bases:]
    if debug:
        print('          removed %d bases' % extra_bases)

    if seq in glfo['seqs'][region].values():
        same_seq_genes = [utils.color_gene(g) for g, s in glfo['seqs'][region].items() if s == seq]
        print('    trimmed seq already in glfo under name %s, so removing it' % ' '.join(same_seq_genes))
        glutils.remove_gene(glfo, gene, debug=True)
        return

    glfo['seqs'][region][gene] = seq
    glfo['cyst-positions'][gene] -= extra_bases  # trimming from the left shifts the codon position by the same amount
    assert utils.codon_unmutated('cyst', glfo['seqs'][region][gene], glfo['cyst-positions'][gene], debug=True)
コード例 #8
0
def check_a_bunch_of_codons(codon, seqons, extra_str='', debug=False):  # seqons: list of (seq, pos) pairs
    """
    Check <codon> at <pos> in each (seq, pos) pair in <seqons>, and keep track of some statistics.

    Each pair is counted as too short (if <seq> ends before pos + 3), ok (if utils.codon_unmutated() returns true), or mutated.
    If <debug> is set the counts are printed on one line (prefixed with <extra_str>), otherwise nothing is printed. Returns None.
    """
    n_total = n_ok = n_too_short = n_bad_codons = 0
    for seq, pos in seqons:
        n_total += 1
        if pos + 3 > len(seq):
            n_too_short += 1
            continue
        if utils.codon_unmutated(codon, seq, pos):
            n_ok += 1
        else:
            n_bad_codons += 1

    if not debug:
        return

    pieces = ['%s%d %s positions:' % (extra_str, n_total, codon)]
    for fmt, count in (('  %d ok', n_ok), ('  %d too short', n_too_short), ('  %d mutated', n_bad_codons)):
        if count > 0:  # only mention the non-zero categories
            pieces.append(fmt % count)
    pieces.append('')  # gives a single space before the newline, same as a series of comma-terminated print statements followed by print ''
    print(' '.join(pieces))
コード例 #9
0
 def choose_position():
     """
     Return a random index into <seq> that isn't already in <snpd_positions>, and for which utils.codon_unmutated() says the cysteine codon at <cpos> is untouched.

     NOTE <seq>, <snpd_positions> and <cpos> aren't arguments, i.e. they have to come from the enclosing scope.
     """
     while True:
         candidate = random.randint(0, len(seq) - 1)  # note that randint() is inclusive
         if candidate in snpd_positions:
             continue
         masked_seq = seq[: candidate] + 'X' + seq[candidate + 1 :]  # for checking cyst position
         if utils.codon_unmutated('cyst', masked_seq, cpos, debug=True):
             return candidate
コード例 #10
0
def get_missing_codon_info(glfo, debug=False):
    """
    Add conserved codon positions to <glfo> for any genes that have a sequence but no position.

    For each (region, codon) in utils.conserved_codons for this locus: if any genes are missing, get alignments from get_new_alignments(), find a gene with
    a known (and unmutated) position, convert that position to alignment coordinates, and then convert it back to an unaligned position for each missing gene.
    This assumes that once aligned, all genes have the same codon position.
    Modifies <glfo> in place (the known gene's position is also recalculated and overwritten), and returns None.
    Raises Exception if there's no usable known position. Set <debug> to a value greater than 1 for a per-gene printout.
    """
    # debug = 2

    for region, codon in utils.conserved_codons[glfo['locus']].items():
        pos_key = codon + '-positions'
        missing_genes = set(glfo['seqs'][region]) - set(glfo[pos_key])
        if not missing_genes:
            if debug:
                print('      no missing %s info' % codon)
            continue

        if debug:
            print('      missing %d %s positions' % (len(missing_genes), codon))

        aligned_seqs = get_new_alignments(glfo, region, debug=debug)

        # existing codon position (this assumes that once aligned, all genes have the same codon position -- which is only really true for the imgt-gapped alignment)
        if len(glfo[pos_key]) > 0:
            known_gene, known_pos = None, None
            known_but_not_in_glfo, known_but_unaligned, known_but_mutated = [], [], []
            # take the first one for which we have the sequence (NOTE it would be safer to check that they're all the same)
            for gene, pos in glfo[pos_key].items():
                if gene not in glfo['seqs'][region]:
                    known_but_not_in_glfo.append(gene)
                elif gene not in aligned_seqs:
                    known_but_unaligned.append(gene)
                elif not utils.codon_unmutated(codon, glfo['seqs'][region][gene], pos):
                    known_but_mutated.append(gene)
                else:
                    known_gene, known_pos = gene, pos
                    break
            if known_gene is None:
                raise Exception('couldn\'t find a known %s position\n    known but not in glfo: %s\n    known but unaligned: %s\n    known but mutated: %s' % (codon, ' '.join(known_but_not_in_glfo), ' '.join(known_but_unaligned), ' '.join(known_but_mutated)))
            # NOTE for cyst, should be 309 if alignments are imgt [which they used to usually be, but now probably aren't] (imgt says 104th codon --> subtract 1 to get zero-indexing, then multiply by three: 3 * (104 - 1) = 309)
            known_pos_in_alignment = get_pos_in_alignment(codon, aligned_seqs[known_gene], glfo['seqs'][region][known_gene], known_pos, debug=debug)
            if debug:
                print('  using known position %d (aligned %d) from %s' % (known_pos, known_pos_in_alignment, known_gene))
        elif codon == 'cyst':
            known_pos_in_alignment = 309
            print('      assuming aligned %s position is %d (this will %s work if you\'re using imgt alignments)' % (codon, known_pos_in_alignment, utils.color('red', 'only')))
            raise Exception('not really using imgt alignments much any more, so this isn\'t really going to work')
        else:
            raise Exception('no existing %s info, and couldn\'t guess it, either' % codon)

        seqons = []  # (seq, pos) pairs
        for gene in [known_gene] + list(missing_genes):
            n_gaps = utils.count_gaps(aligned_seqs[gene], istop=known_pos_in_alignment)
            unaligned_pos = known_pos_in_alignment - n_gaps
            seqons.append((glfo['seqs'][region][gene], unaligned_pos))
            glfo[pos_key][gene] = unaligned_pos
            if debug > 1:
                aln = aligned_seqs[gene]
                codon_end = known_pos_in_alignment + 3
                aln_codon = aln[known_pos_in_alignment : codon_end]
                print('            %s%s%s   %s %3s %5s' % (aln[:known_pos_in_alignment],
                                                         utils.color('reverse_video', aln_codon),
                                                         aln[codon_end:],
                                                         utils.color_gene(gene, width=12 if region == 'v' else 8),
                                                         '' if aln_codon in utils.codon_table[codon] else utils.color('red', 'bad'),
                                                         'new' if gene != known_gene else ''))
        n_added = len(seqons)  # NOTE this includes the known gene, as well as the missing ones

        check_a_bunch_of_codons(codon, seqons, extra_str='          ', debug=debug)
        if debug:
            print('      added %d %s positions' % (n_added, codon))
コード例 #11
0
def get_pos_in_alignment(codon, aligned_seq, seq, pos, debug=False):
    """
    Given the position <pos> of <codon> in <seq>, return the codon's position in <aligned_seq>.

    The aligned position is <pos> plus whatever get_n_gaps_up_to_pos() returns for <aligned_seq> and <pos>.
    Asserts (with utils.codon_unmutated()) that the codon is ok both at <pos> in <seq> and at the returned position in <aligned_seq>.
    """
    # this only gets called on the gene with the *known* position, so it shouldn't fail
    assert utils.codon_unmutated(codon, seq, pos, debug=debug)
    n_gaps_before = get_n_gaps_up_to_pos(aligned_seq, pos)
    pos_in_alignment = pos + n_gaps_before
    assert utils.codon_unmutated(codon, aligned_seq, pos_in_alignment, debug=debug)
    return pos_in_alignment