Esempio n. 1
0
def get_v_cdr3_nucseq( organism, v_gene, paranoid = False ):
    """Return the part of a V gene's nucleotide sequence that starts at the CDR3 'C' codon.

    organism -- key into the read_sanger_data / cdr3s_human tables; 'mouse'
                additionally triggers a hard-coded sequence correction below
    v_gene   -- V allele name such as 'TRAV13D-1*01'; its third character is
                used as the chain key into the per-chain sub-tables
    paranoid -- when true, also assert that the offset-trimmed nucleotide
                sequence, translated in frame '+1', starts with the stored
                protein sequence for this gene

    Raises AssertionError when the paranoid check fails or when the mouse
    special case does not see the sequence it expects to correct.
    """
    chain = v_gene[2]
    v_tables = read_sanger_data.all_fasta[organism][chain]['V']

    ## discard the bases that precede the stored offset for this gene
    full_nucseq = v_tables[read_sanger_data.nuc][v_gene]
    start = read_sanger_data.all_offsets[organism][chain]['V'][v_gene]
    trimmed_nucseq = full_nucseq[start:]

    if paranoid:
        expected_protseq = v_tables[read_sanger_data.prot][v_gene]
        translated = read_sanger_data.get_translation( trimmed_nucseq, '+1' )[0]
        assert translated.startswith( expected_protseq )

    ## alseq_C_pos is 1-indexed in alignment columns; convert it to a 0-indexed
    ## position in the ungapped sequence by subtracting the '.' gap characters
    ## that precede it in this gene's aligned sequence
    aligned_seq = cdr3s_human.all_align_fasta[organism][ v_gene ]
    c_column = cdr3s_human.alseq_C_pos[organism][chain] - 1
    c_position = c_column - aligned_seq[:c_column].count('.')

    ## three nucleotides per residue: the result starts with the 'C' codon
    cdr3_nucseq = trimmed_nucseq[3*c_position:]

    if organism == 'mouse' and v_gene == 'TRAV13D-1*01':
        #-----------------------------------
        #../../../tmp.blat:mismatch: V 6 imgt: a genome: t TRAV13D-1*01
        #tmp.whoah:whoah  6 act: t  98.7 exp: a   1.1 TRAV13D-1*01 TRAV13-1*01 620
        #tmp.whoah:whoah: expected: caaggtatcgtgt consensus: caaggtttcgtgt TRAV13D-1*01 620
        #tmp.3.whoah:whoah  6 act: t  97.4 exp: a   1.4 TRAV13D-1*01 TRAV13-1*01 642
        #tmp.3.whoah:whoah: expected: caaggtatcgtgt consensus: caaggtttcgtgt TRAV13D-1*01 642
        #tmp.la_mc.whoah:whoah  6 act: t  89.0 exp: a   7.0 TRAV13D-1*01 TRAV13-1*01 100
        #tmp.la_mc.whoah:whoah: expected: caaggtatcgtgt consensus: caaggtttcgtgt TRAV13D-1*01 100
        ## make sure we are replacing exactly the sequence we think we are
        assert cdr3_nucseq == 'tgtgctatggaac' ## CAM
        return 'tgtgctttggaac' ## CAL

    return cdr3_nucseq
Esempio n. 2
0
def get_j_cdr3_nucseq( organism, j_gene, paranoid = False ):
    """Return the leading part of a J gene's nucleotide sequence that belongs to the CDR3.

    organism -- key into the read_sanger_data / cdr3s_human tables; 'mouse'
                additionally triggers hard-coded sequence corrections below
    j_gene   -- J allele name such as 'TRAJ47*01'; its third character is
                used as the chain key into the per-chain sub-tables
    paranoid -- when true, also assert that the full J nucleotide sequence,
                translated in frame '+<offset+1>', starts with the stored
                protein sequence for this gene

    Raises AssertionError when the paranoid check fails or when a mouse
    special case does not see the sequence it expects to correct.
    """
    chain = j_gene[2]
    j_tables = read_sanger_data.all_fasta[organism][chain]['J']

    full_nucseq = j_tables[read_sanger_data.nuc][j_gene]
    offset = read_sanger_data.all_offsets[organism][chain]['J'][j_gene]

    if paranoid:
        expected_protseq = j_tables[read_sanger_data.prot][j_gene]
        frame = '+{}'.format(offset+1)
        translated = read_sanger_data.get_translation( full_nucseq, frame )[0]
        assert translated.startswith( expected_protseq )

    ## number of J positions counted as part of the loop, plus 2 ("to GXG")
    num_loop_positions = cdr3s_human.all_num_genome_j_positions_in_loop[organism][chain][j_gene] + 2

    ## keep the out-of-frame leading bases plus three bases per loop position,
    ## so that the sequence extends up to the F/W position
    cdr3_nucseq = full_nucseq[:offset + 3*num_loop_positions]

    if organism != 'mouse':
        return cdr3_nucseq

    ## mouse-only corrections: pick the sequence we expect to see and the
    ## sequence that should be returned in its place
    if j_gene == 'TRAJ47*01':
        # -----------------------------------
        # ../../../tmp.blat:mismatch: J 2 imgt: c genome: g TRAJ47*01
        # ../../../tmp.blat:mismatch: J 24 imgt: g genome: t TRAJ47*01
        # tmp.whoah:whoah  2 act: g  81.9 exp: c   4.7 TRAJ47*01 TRAJ47*01 1412
        # tmp.whoah:whoah 24 act: t  82.7 exp: g  16.8 TRAJ47*01 TRAJ47*01 1412
        # tmp.whoah:whoah: expected: tgcactatgcaaacaagatgatctgt consensus: tggactatgcaaacaagatgatcttt TRAJ47*01 1412
        # tmp.3.whoah:whoah  2 act: g  81.6 exp: c   5.0 TRAJ47*01 TRAJ47*01 1362
        # tmp.3.whoah:whoah 24 act: t  82.7 exp: g  16.6 TRAJ47*01 TRAJ47*01 1362
        # tmp.3.whoah:whoah: expected: tgcactatgcaaacaagatgatctgt consensus: tggactatgcaaacaagatgatcttt TRAJ47*01 1362
        # tmp.la_mc.whoah:whoah  2 act: g  79.6 exp: c   5.3 TRAJ47*01 TRAJ47*01 113
        # tmp.la_mc.whoah:whoah 24 act: t  99.1 exp: g   0.9 TRAJ47*01 TRAJ47*01 113
        # tmp.la_mc.whoah:whoah: expected: tgcactatgcaaacaagatgatctgt consensus: tggactatgcaaacaagatgatcttt TRAJ47*01 113
        expected_nucseq  = 'tgcactatgcaaacaagatgatctgt' ## C at end
        corrected_nucseq = 'tggactatgcaaacaagatgatcttt' ## F at end
    elif j_gene == 'TRAJ24*01':
        # -----------------------------------
        # ../../../tmp.blat:unaligned: J 0 TRAJ24*01
        # ../../../tmp.blat:unaligned: J 1 TRAJ24*01
        # ../../../tmp.blat:gapped: J 6 TRAJ24*01
        # tmp.whoah:whoah  2 act: c  60.3 exp: a  15.3 TRAJ24*01 TRAJ24*01 464
        # tmp.whoah:whoah  4 act: a  88.6 exp: c   2.8 TRAJ24*01 TRAJ24*01 464
        # tmp.whoah:whoah  5 act: c  93.3 exp: t   1.5 TRAJ24*01 TRAJ24*01 464
        # tmp.whoah:whoah  6 act: t  97.2 exp: g   1.1 TRAJ24*01 TRAJ24*01 464
        # tmp.whoah:whoah: expected: tgaactggccagtttggggaaactgcagttt consensus: gacaactgccagtttggggaaactgcagttt TRAJ24*01 464
        # tmp.3.whoah:whoah  2 act: c  60.8 exp: a  13.9 TRAJ24*01 TRAJ24*01 475
        # tmp.3.whoah:whoah  4 act: a  86.3 exp: c   4.2 TRAJ24*01 TRAJ24*01 475
        # tmp.3.whoah:whoah  5 act: c  94.5 exp: t   1.1 TRAJ24*01 TRAJ24*01 475
        # tmp.3.whoah:whoah  6 act: t  98.1 exp: g   1.1 TRAJ24*01 TRAJ24*01 475
        # tmp.3.whoah:whoah: expected: tgaactggccagtttggggaaactgcagttt consensus: gacaactgccagtttggggaaactgcagttt TRAJ24*01 475
        # tmp.la_mc.whoah:whoah  2 act: c  75.3 exp: a   4.3 TRAJ24*01 TRAJ24*01 93
        # tmp.la_mc.whoah:whoah  4 act: a  89.2 exp: c   2.2 TRAJ24*01 TRAJ24*01 93
        # tmp.la_mc.whoah:whoah  5 act: c  97.8 exp: t   1.1 TRAJ24*01 TRAJ24*01 93
        # tmp.la_mc.whoah:whoah  6 act: t  98.9 exp: g   0.0 TRAJ24*01 TRAJ24*01 93
        # tmp.la_mc.whoah:whoah: expected: tgaactggccagtttggggaaactgcagttt consensus: gacaactgccagtttggggaaactgcagttt TRAJ24*01 93
        ## take the consensus
        ## given that there's an indel (and the alignment to the genome starts at j sequence position 3)
        ## it's hard to tell what to do at the beginning...
        expected_nucseq  = 'tgaactggccagtttggggaaactgcagttt'
        corrected_nucseq = 'gacaactgccagtttggggaaactgcagttt'
    else:
        return cdr3_nucseq

    ## make sure we are replacing exactly the sequence we think we are
    assert cdr3_nucseq == expected_nucseq
    return corrected_nucseq
Esempio n. 3
0
                    Log('skipping: badseq: {} {}'.format(
                        cdr3a_protseq, cdr3b_protseq))
                    skip_me = True
                    break

        if skip_me:
            continue

    ## probs are computed by reps
    va_reps = l['va_reps'].split(';')
    ja_reps = l['ja_reps'].split(';')
    va_countreps = l['va_countreps'].split(';')
    ja_countreps = l['ja_countreps'].split(';')
    va_cdr3_nucseq = tcr_sampler.get_v_cdr3_nucseq(organism, va_gene)
    ja_cdr3_nucseq = tcr_sampler.get_j_cdr3_nucseq(organism, ja_gene)
    va_cdr3_protseq, codons = read_sanger_data.get_translation(
        va_cdr3_nucseq, '+1')
    ja_cdr3_protseq, codons = read_sanger_data.get_translation(
        ja_cdr3_nucseq, '+{}'.format(1 + len(ja_cdr3_nucseq) % 3))

    aprob_nucseq, new_cdr3a_nucseq = tcr_sampler.alpha_cdr3_protseq_probability(
        organism,
        va_gene,
        ja_gene,
        cdr3_protseq='',
        cdr3_nucseq=cdr3a_nucseq,
        verbose=verbose,
        return_final_cdr3_nucseq=True)

    if new_cdr3a_nucseq != cdr3a_nucseq:  ## note note note
        print 'new_cdr3a_nucseq:', len(new_cdr3a_nucseq), new_cdr3a_nucseq
        print 'old_cdr3a_nucseq:', len(cdr3a_nucseq), cdr3a_nucseq
Esempio n. 4
0
def beta_cdr3_protseq_probability( organism, v_gene, j_gene, cdr3_protseq,
                                   cdr3_nucseq = '', error_threshold = 0.05, verbose=False,
                                   allow_early_nucseq_mismatches = True,
                                   return_final_cdr3_nucseq = False ):
    """Sum the probability of beta-chain V/D/J rearrangements that yield the given CDR3.

    The function enumerates V and J trims, D gene choice and D trims, and the
    split of the remaining inserted nucleotides before/after D, and adds up
    did_prob * coding_prob * trim_prob over all of them.

    organism      -- key into the germline / probability tables
    v_gene        -- V allele name; its third character must be 'B'
    j_gene        -- J allele name; its fifth character must be '1' or '2'
                     and decides which D genes are considered
    cdr3_protseq  -- CDR3 amino acid sequence; must be empty when cdr3_nucseq
                     is given
    cdr3_nucseq   -- optional CDR3 nucleotide sequence. When non-empty the
                     function works in "nucleotide match" mode: the protein
                     sequence is obtained by translating it in frame '+1' and
                     candidate rearrangements must agree with it base by base
    error_threshold -- the enumeration over extra trimming stops once
                     extra_trim > 2 and the probability added at the current
                     extra_trim is below error_threshold times the total
                     accumulated before it
    verbose       -- print diagnostic output
    allow_early_nucseq_mismatches -- nucleotide mode only: score germline
                     mismatches with
                     default_mismatch_score_for_cdr3_nucseq_probabilities
                     instead of -100, and overwrite the germline-assigned ends
                     of cdr3_nucseq with the germline bases
    return_final_cdr3_nucseq -- if true return (total_prob, cdr3_nucseq),
                     where cdr3_nucseq may have been rewritten as described
                     above; otherwise return total_prob only

    Raises AssertionError when one of the internal consistency checks fails.

    NOTE(review): the code relies on Python 2 semantics (print statements,
    integer '/' division).
    """
    nucleotide_match = ( cdr3_nucseq != '' )
    if nucleotide_match:
        ## the protein sequence is derived from the nucleotides, so the caller must not pass both
        assert not cdr3_protseq
        cdr3_protseq = read_sanger_data.get_translation( cdr3_nucseq, '+1' )[0]
        assert len(cdr3_nucseq) == 3 * len(cdr3_protseq )

    ab = 'B'
    assert v_gene[2] == 'B'

    ## germline V sequence from the 'C' codon on, and germline J sequence up to its CDR3 end
    v_nucseq = get_v_cdr3_nucseq( organism, v_gene )
    j_nucseq = get_j_cdr3_nucseq( organism, j_gene )

    ## what is the largest amount of these nucseqs we could preserve and still get cdr3_protseq
    max_v_germline = 0
    max_j_germline = 0

    len_v_nucseq = len(v_nucseq)
    len_j_nucseq = len(j_nucseq)
    len_cdr3_nucseq = len(cdr3_nucseq)
    len_cdr3_protseq = len(cdr3_protseq)

    if nucleotide_match:
        if allow_early_nucseq_mismatches:
            mismatch_score = default_mismatch_score_for_cdr3_nucseq_probabilities
        else:
            mismatch_score = -100
        ## presumably count_matches returns how many leading bases of the first
        ## sequence are assigned to germline under this mismatch score -- TODO confirm
        max_v_germline = count_matches( v_nucseq, cdr3_nucseq, mismatch_score )

        ## J is matched from the 3' end, so both sequences are reversed first
        max_j_germline = count_matches( ''.join( reversed( list( j_nucseq ) )),
                                        ''.join( reversed( list( cdr3_nucseq ))),
                                        mismatch_score )

        if allow_early_nucseq_mismatches: ## obliterate the mismatches now
            ## local copies: max_v_germline / max_j_germline themselves stay untouched
            max_v, max_j = max_v_germline, max_j_germline
            if max_v + max_j > len(cdr3_nucseq):
                ## some overlap!
                extra = max_v + max_j - len(cdr3_nucseq )
                #print 'TRIM extra',extra
                ## split the overlap between V and J ('/' is Python 2 integer division here)
                fake_v_trim = extra/2 ## now deterministic
                fake_j_trim = extra - fake_v_trim
                max_v -= fake_v_trim
                max_j -= fake_j_trim
            old_cdr3_nucseq = cdr3_nucseq[:]
            ## rebuild cdr3_nucseq with germline bases at both ends and the original middle
            cdr3_nucseq = v_nucseq[:max_v] + \
                          cdr3_nucseq[ max_v : len_cdr3_nucseq-max_j ] + \
                          j_nucseq[len_j_nucseq-max_j:]
            if old_cdr3_nucseq != cdr3_nucseq:
                ## NOTE(review): the label says 'cdr3a' although this is the beta-chain
                ## function (alpha_cdr3_protseq_probability logs the same label) -- confirm
                ## whether log consumers depend on it before renaming
                Log('early_cdr3a_nucseq_mismatch: before {} after {}'.format( old_cdr3_nucseq, cdr3_nucseq ) )
                assert len(cdr3_nucseq) == len(old_cdr3_nucseq)

    else:
        ## V
        ## walk along the V nucleotides; position i is acceptable if the codon
        ## prefix ending at i starts some entry of reverse_genetic_code for the
        ## amino acid at that CDR3 position
        for i in range( len(v_nucseq)):
            i_aa = i/3 ## which aa do we code for?
            len_codon = (i%3) + 1
            if i_aa >= len(cdr3_protseq): break
            start = 3*i_aa
            codon = v_nucseq[ start:start+len_codon]
            target_aa = cdr3_protseq[ i_aa ]
            matched = False
            for c in reverse_genetic_code[target_aa]:
                if c.startswith(codon):
                    matched = True
            if verbose:
                print 'V',codon, target_aa, matched
            if matched:
                max_v_germline = i+1
            else:
                break

        ## J
        ## same idea from the 3' end: compare codon suffixes against the CDR3
        ## amino acids taken from the end of the protein sequence
        for i in range( len_j_nucseq):
            i_aa = i/3 ## which aa do we code for?
            len_codon = (i%3) + 1
            if i_aa >= len(cdr3_protseq): break
            end   = len(j_nucseq)-3*i_aa
            codon = j_nucseq[max(0,end-len_codon):end]
            target_aa = cdr3_protseq[ len_cdr3_protseq-1-i_aa ]
            matched = False
            for c in reverse_genetic_code[target_aa]:
                if c.endswith(codon):
                    matched = True
            if verbose:
                print 'J',codon, target_aa, matched
            if matched:
                max_j_germline = i+1
            else:
                break


    if verbose:
        print 'max_v_germline:',max_v_germline, len(v_nucseq)

    ## how about J?

    ## nucleotides that cannot come from V or J germline; negative when the
    ## V and J germline stretches overlap
    min_insert = 3*len_cdr3_protseq - max_v_germline - max_j_germline
    if verbose:
        print 'max_j_germline:',max_j_germline, len_j_nucseq,cdr3_protseq,\
            read_sanger_data.all_fasta[organism][ab]['J'][read_sanger_data.prot][j_gene]

        print 'min_insert:',min_insert,max_v_germline,max_j_germline

    trbj_index = int( j_gene[4] ) ## to decide which d genes to allow
    assert trbj_index in [1,2]

    total_prob = 0.0
    ## if V and J overlap we must trim at least enough to remove the overlap
    min_extra_trim = max(0,-1*min_insert)

    dids = tcr_rearrangement.all_trbd_nucseq[organism].keys()
    ## extra_trim = total number of additional bases trimmed beyond the maximal germline match
    for extra_trim in range(min_extra_trim,100):
        old_total_prob = total_prob
        total_prob_this_trim = 0.0
        ## try every split of extra_trim between the V side and the J side
        for extra_v_trim in range(0,extra_trim+1):
            extra_j_trim = extra_trim - extra_v_trim

            v_trim = len_v_nucseq - max_v_germline + extra_v_trim
            j_trim = len_j_nucseq - max_j_germline + extra_j_trim
            ## cannot trim more than the whole germline segment
            if v_trim > len_v_nucseq or j_trim > len_j_nucseq: continue

            n_insert = min_insert + extra_v_trim + extra_j_trim
            assert n_insert>=0 ## b/c of min_extra_trim
            total_prob_this_insert = 0.0

            ## now we are looking to fit part of the D gene into this middle region and still code for the right aas
            for did in dids:
                ## with a J1 gene only D gene id 1 is considered (probability 1);
                ## otherwise every D gene id is considered with equal probability
                if trbj_index == 1:
                    if did == 1:
                        did_prob = 1.0
                    else:
                        continue
                else:
                    did_prob = 1.0/float(len(dids))
                d_nucseq = tcr_rearrangement.all_trbd_nucseq[organism][did]
                len_d_nucseq = len( d_nucseq )
                ## d0_trim / d1_trim: bases removed from the start / end of the D sequence
                for d0_trim in range(len_d_nucseq+1):
                    for d1_trim in range(len_d_nucseq+1):
                        len_d_insert = len_d_nucseq - d0_trim - d1_trim
                        ## skip over-trimmed D, or D remnants that do not fit in the insert
                        if len_d_insert < 0 or len_d_insert > n_insert: continue
                        #if len_d_insert == 0 and d1_trim: continue ## only hit this one once!
                        d_insert = d_nucseq[ d0_trim: len_d_nucseq-d1_trim]
                        num_n = n_insert - len_d_insert
                        ## distribute the remaining inserted bases before / after D
                        for num_n_before_d in range(num_n+1):
                            num_n_after_d = num_n - num_n_before_d
                            assert num_n_after_d>=0

                            ## candidate sequence; 'n' marks an inserted base of unspecified identity
                            n_nucseq = ( v_nucseq[:len_v_nucseq-v_trim] +
                                         'n'*num_n_before_d + d_insert + 'n'*num_n_after_d +
                                         j_nucseq[j_trim:] )

                            assert len(n_nucseq) == 3*len_cdr3_protseq

                            trim_prob = tcr_rearrangement.get_beta_trim_probs( organism, did,
                                                                               v_trim, d0_trim, d1_trim, j_trim,
                                                                               num_n_before_d, num_n_after_d )
                            if not trim_prob: continue

                            if nucleotide_match:
                                assert len(n_nucseq) == len_cdr3_nucseq
                                matched = True
                                #print n_nucseq, cdr3_nucseq
                                ## every fixed (non-'n') base has to agree with cdr3_nucseq
                                for a,b in zip( n_nucseq, cdr3_nucseq ):
                                    if a!=b and a!= 'n':
                                        matched=False
                                if matched:
                                    ## each inserted base contributes a factor of 1/4
                                    coding_prob = 0.25 ** num_n
                                else:
                                    coding_prob = 0.0
                            else:
                                ## presumably the probability that n_nucseq, with random
                                ## bases at the 'n' positions, codes for cdr3_protseq -- TODO confirm
                                coding_prob = get_coding_probability( n_nucseq, cdr3_protseq )
                            prob = did_prob * coding_prob * trim_prob

                            total_prob_this_insert += prob ## just for status output
                            total_prob_this_trim += prob
                            total_prob += prob

                            if verbose and coding_prob:
                                print 'coding_prob:',cdr3_protseq,"trims:",v_trim,d0_trim,d1_trim,j_trim,\
                                    "inserts:",num_n_before_d,num_n_after_d,\
                                    "d_insert:",d_insert,\
                                    "total_prob:",total_prob,"prob:",prob,"coding_prob:",coding_prob,\
                                    "trim_prob:",trim_prob,n_nucseq


            if verbose:
                print 'n_insert:',n_insert,extra_v_trim,extra_j_trim,'total_prob:',total_prob,\
                    'total_prob_this_insert:',total_prob_this_insert


        ## stop once further trimming adds relatively little to the total
        if extra_trim>2 and total_prob_this_trim < error_threshold * old_total_prob:
            break

    if return_final_cdr3_nucseq:
        return total_prob, cdr3_nucseq
    else:
        return total_prob
Esempio n. 5
0
def alpha_cdr3_protseq_probability( organism, v_gene, j_gene, cdr3_protseq,
                                    cdr3_nucseq = '', error_threshold = 0.05, verbose=False,
                                    allow_early_nucseq_mismatches = True,
                                    return_final_cdr3_nucseq = False ):
    """Sum the probability of alpha-chain V/J rearrangements that yield the given CDR3.

    The function enumerates V and J trims (with the gap between them filled by
    inserted nucleotides) and adds up coding_prob * trim_prob over all of them.

    organism      -- key into the germline / probability tables
    v_gene        -- V allele name; its third character must be 'A'
    j_gene        -- J allele name
    cdr3_protseq  -- CDR3 amino acid sequence; must be empty when cdr3_nucseq
                     is given
    cdr3_nucseq   -- optional CDR3 nucleotide sequence. When non-empty the
                     function works in "nucleotide match" mode: the protein
                     sequence is obtained by translating it in frame '+1' and
                     the germline stretches are determined on the nucleotide
                     level
    error_threshold -- the enumeration over extra trimming stops once
                     extra_trim > 2 and the probability added at the current
                     extra_trim is below error_threshold times the total
                     accumulated before it
    verbose       -- print diagnostic output
    allow_early_nucseq_mismatches -- nucleotide mode only: score germline
                     mismatches with
                     default_mismatch_score_for_cdr3_nucseq_probabilities
                     instead of -100, and overwrite the germline-assigned ends
                     of cdr3_nucseq with the germline bases
    return_final_cdr3_nucseq -- if true return (total_prob, cdr3_nucseq),
                     where cdr3_nucseq may have been rewritten as described
                     above; otherwise return total_prob only

    Raises AssertionError when one of the internal consistency checks fails.

    NOTE(review): the code relies on Python 2 semantics (print statements,
    integer '/' division).
    """
    #assert organism == 'mouse' ## need new stats for human

    nucleotide_match = ( cdr3_nucseq != '' )
    if nucleotide_match:
        ## the protein sequence is derived from the nucleotides, so the caller must not pass both
        assert not cdr3_protseq
        cdr3_protseq = read_sanger_data.get_translation( cdr3_nucseq, '+1' )[0]
        assert len(cdr3_nucseq) == 3 * len(cdr3_protseq )

    ab = 'A'
    assert v_gene[2] == 'A'

    ## germline V sequence from the 'C' codon on, and germline J sequence up to its CDR3 end
    v_nucseq = get_v_cdr3_nucseq( organism, v_gene )
    j_nucseq = get_j_cdr3_nucseq( organism, j_gene )

    ## what is the largest amount of these nucseqs we could preserve and still get cdr3_protseq

    max_v_germline = 0
    len_v_nucseq = len(v_nucseq)
    max_j_germline = 0

    len_j_nucseq = len(j_nucseq)
    len_cdr3_protseq = len(cdr3_protseq)
    len_cdr3_nucseq = len(cdr3_nucseq)

    if nucleotide_match:
        if allow_early_nucseq_mismatches:
            mismatch_score = default_mismatch_score_for_cdr3_nucseq_probabilities
        else:
            mismatch_score = -100
        ## presumably count_matches returns how many leading bases of the first
        ## sequence are assigned to germline under this mismatch score -- TODO confirm
        max_v_germline = count_matches( v_nucseq, cdr3_nucseq, mismatch_score )

        ## J is matched from the 3' end, so both sequences are reversed first
        max_j_germline = count_matches( ''.join( reversed( list( j_nucseq ) )),
                                        ''.join( reversed( list( cdr3_nucseq ))),
                                        mismatch_score )

        if allow_early_nucseq_mismatches: ## obliterate the mismatches now
            ## local copies: max_v_germline / max_j_germline themselves stay untouched
            max_v, max_j = max_v_germline, max_j_germline
            if max_v + max_j > len(cdr3_nucseq):
                ## some overlap!
                extra = max_v + max_j - len(cdr3_nucseq )
                #print 'TRIM extra',extra
                ## split the overlap between V and J ('/' is Python 2 integer division here)
                fake_v_trim = extra/2 ## now deterministic
                fake_j_trim = extra - fake_v_trim
                max_v -= fake_v_trim
                max_j -= fake_j_trim
            old_cdr3_nucseq = cdr3_nucseq[:]
            ## rebuild cdr3_nucseq with germline bases at both ends and the original middle
            cdr3_nucseq = v_nucseq[:max_v] + \
                          cdr3_nucseq[ max_v : len_cdr3_nucseq-max_j ] + \
                          j_nucseq[len_j_nucseq-max_j:]
            if old_cdr3_nucseq != cdr3_nucseq:
                Log('early_cdr3a_nucseq_mismatch: {} {} before {} after {}'.format( v_gene, j_gene,
                                                                                    old_cdr3_nucseq, cdr3_nucseq ) )
                assert len(cdr3_nucseq) == len(old_cdr3_nucseq)
    else:

        ## walk along the V nucleotides; position i is acceptable if the codon
        ## prefix ending at i starts some entry of reverse_genetic_code for the
        ## amino acid at that CDR3 position
        for i in range( len(v_nucseq)):
            i_aa = i/3 ## which aa do we code for?
            len_codon = (i%3) + 1
            if i_aa >= len(cdr3_protseq): break
            start = 3*i_aa
            codon = v_nucseq[ start:start+len_codon]
            target_aa = cdr3_protseq[ i_aa ]
            matched = False
            for c in reverse_genetic_code[target_aa]:
                if c.startswith(codon):
                    matched = True
            if verbose:
                print 'V',codon, target_aa, matched
            if matched:
                max_v_germline = i+1
            else:
                break

        ## how about J?
        ## same idea from the 3' end: compare codon suffixes against the CDR3
        ## amino acids taken from the end of the protein sequence
        for i in range( len_j_nucseq):
            i_aa = i/3 ## which aa do we code for?
            len_codon = (i%3) + 1
            if i_aa >= len(cdr3_protseq): break
            end   = len(j_nucseq)-3*i_aa
            codon = j_nucseq[max(0,end-len_codon):end]
            target_aa = cdr3_protseq[ len_cdr3_protseq-1-i_aa ]
            matched = False
            for c in reverse_genetic_code[target_aa]:
                if c.endswith(codon):
                    matched = True
            if verbose:
                print 'J',codon, target_aa, matched
            if matched:
                max_j_germline = i+1
            else:
                break


    ## nucleotides that cannot come from V or J germline; negative when the
    ## V and J germline stretches overlap
    min_insert = 3*len_cdr3_protseq - max_v_germline - max_j_germline
    if verbose:
        print 'max_v_germline:',max_v_germline, len_v_nucseq, v_nucseq, cdr3_nucseq

        print 'max_j_germline:',max_j_germline, len_j_nucseq, j_nucseq, cdr3_nucseq, \
            read_sanger_data.all_fasta[organism][ab]['J'][read_sanger_data.prot][j_gene]

        print 'min_insert:',min_insert,max_v_germline,max_j_germline

    total_prob = 0.0
    ## if V and J overlap we must trim at least enough to remove the overlap
    min_extra_trim = max(0,-1*min_insert)
    ## extra_trim = total number of additional bases trimmed beyond the maximal germline match
    for extra_trim in range(min_extra_trim,100):
        old_total_prob = total_prob
        total_prob_this_trim = 0.0
        ## try every split of extra_trim between the V side and the J side
        for extra_v_trim in range(0,extra_trim+1):
            extra_j_trim = extra_trim - extra_v_trim

            v_trim = len_v_nucseq - max_v_germline + extra_v_trim
            j_trim = len_j_nucseq - max_j_germline + extra_j_trim
            ## cannot trim more than the whole germline segment
            if v_trim > len_v_nucseq or j_trim > len_j_nucseq: continue

            n_insert = min_insert + extra_v_trim + extra_j_trim
            ## candidate sequence; 'n' marks an inserted base of unspecified identity
            n_nucseq = v_nucseq[:len_v_nucseq-v_trim] + 'n'*n_insert + j_nucseq[j_trim:]

            assert len(n_nucseq) == 3*len_cdr3_protseq
            if nucleotide_match:
                ## each inserted base contributes a factor of 1/4
                ## NOTE(review): unlike beta_cdr3_protseq_probability, n_nucseq is not
                ## compared against cdr3_nucseq here; presumably this relies on the
                ## germline stretches having been derived from cdr3_nucseq -- confirm
                coding_prob = 0.25 ** n_insert
            else:
                ## presumably the probability that n_nucseq, with random bases at
                ## the 'n' positions, codes for cdr3_protseq -- TODO confirm
                coding_prob = get_coding_probability( n_nucseq, cdr3_protseq )

            trim_prob = tcr_rearrangement.get_alpha_trim_probs( organism, v_trim, j_trim, n_insert )

            total_prob_this_trim += coding_prob * trim_prob
            total_prob += coding_prob * trim_prob

            if verbose:
                print 'coding_prob:',cdr3_protseq,v_trim,j_trim,n_insert,total_prob,coding_prob,trim_prob,n_nucseq

        ## stop once further trimming adds relatively little to the total
        if extra_trim>2 and total_prob_this_trim < error_threshold * old_total_prob:
            break
    if return_final_cdr3_nucseq:
        return total_prob, cdr3_nucseq
    else:
        return total_prob