Example #1
0
def getJsnps(seq, genes, gene2pos):
    '''Locate the clone's J fragment on its reference J gene and look for SNPs.

    The J fragment is seq.nuc[seq.jindex:]. It is searched (as literal text,
    case-insensitively via upper-casing both sides) in genes[seq.jgene]; the
    matched reference window is then handed to getSnps together with the
    fragment itself.

    Args:
        seq: clone record. Reads id, nuc, aa, vindex, jindex and jgene;
            sets seq.jrefstart to the offset of the match on the reference.
        genes: mapping of J gene name -> reference nucleotide sequence.
        gene2pos: mapping of J gene name -> offset on that reference; the
            window considered ends at this offset + 3 (presumably the end of
            the conserved Phe codon -- TODO confirm against the caller).

    Returns:
        Whatever getSnps(jfrag, refjfrag, startPos) returns.

    Raises:
        IndexError: the J fragment does not occur in the reference J gene.
            (Same exception type the original bare starts[0] lookup produced,
            now with a message that identifies the clone.)
        SystemExit: more than one placement yields a productive CDR3.
    '''
    jfrag = seq.nuc[seq.jindex:].upper()
    jseq = genes[seq.jgene].upper()
    pos = gene2pos[seq.jgene] + 3

    # Find where the jfrag is in the jseq. re.escape: the fragment is sequence
    # text, not a regular expression.
    starts = [m.start() for m in re.finditer(re.escape(jfrag), jseq)]
    if not starts:
        raise IndexError(
            "J fragment %s of clone %s was not found in reference %s"
            % (jfrag, seq.id, seq.jgene))

    okstarts = []
    for s in starts:
        e = s + len(jfrag)
        # the primer should be somewhere to the left (5'end) of the J gene and
        # should cover somewhere after the 10 leftmost nucleotides
        if not (s <= 15 and e >= 10 and e < pos):
            continue
        extraseq = jseq[e:pos]
        cdr3len = len(seq.nuc) - seq.vindex + len(extraseq)
        if cdr3len % 3 != 0:  # out of frame
            continue
        # Translate the would-be completed CDR3.
        newNuc = seq.nuc + extraseq
        newaa = iseqlib.nt2aa(newNuc[seq.vindex:])
        if newaa[-1] == 'F' and '*' not in newaa:
            if seq.aa not in newaa:
                sys.stderr.write('Warning: the new infered aa doesnot contain current aa\n')
            okstarts.append(s)

    if len(okstarts) > 1:
        sys.stderr.write("Looking for SNPs if any on the J fragment. However, mapped to multiple Js. Sequence: %s, %s, %s\n" %(seq.id, seq.nuc, seq.aa))
        sys.exit(1)

    # NOTE(review): the window is anchored at the first raw match, not at the
    # single productive placement collected in okstarts -- confirm intended.
    startPos = starts[0]
    endPos = min(startPos + len(jfrag), pos)
    if endPos > len(jseq):
        sys.stderr.write("Clone goes beyond downstream of J gene\n")
    refjfrag = jseq[startPos:endPos]
    seq.jrefstart = startPos
    return getSnps(jfrag, refjfrag, startPos)
Example #2
0
def fillInSeq(seq, genes, gene2pos):
    '''Complete a clone's partial CDR3 using the reference J gene sequence.

    Earlier versions of the adaptiveTCR tsv files did not always contain the
    complete CDR3 sequence (the reads only supported part of it). This
    function looks up the reference J gene the clone was mapped to, finds the
    clone's J fragment (seq.nuc[seq.jindex:]) on it, and appends the reference
    bases that follow the fragment, up to offset gene2pos[seq.jgene] + 3.

    A placement is accepted only when the fragment starts within the first 15
    reference bases, ends at or after base 10 and before the fill-in end, the
    extended CDR3 length is a multiple of 3, and its translation ends with
    'F' and contains no '*'.

    Args:
        seq: clone record. Reads id, nuc, aa, vindex, jindex and jgene. On
            success nuc, aa, jindex, cdr3nuc, inframenuc and longaa are
            overwritten; otherwise seq is left untouched.
        genes: mapping of J gene name -> reference nucleotide sequence.
        gene2pos: mapping of J gene name -> offset on that reference; the
            fill-in stops at this offset + 3 (presumably the end of the
            conserved Phe codon -- TODO confirm against the caller).

    Returns:
        1 if exactly one placement was accepted and seq was updated,
        -1 if zero or several placements were accepted.
    '''
    jfrag = seq.nuc[seq.jindex:].upper()
    jseq = genes[seq.jgene].upper()
    pos = gene2pos[seq.jgene] + 3

    # Find where the jfrag is in the jseq. re.escape: the fragment is sequence
    # text, not a regular expression.
    starts = [m.start() for m in re.finditer(re.escape(jfrag), jseq)]

    # Collect every placement that yields a productive, in-frame CDR3.
    fullseqs = []
    newcdr3s = []
    for s in starts:
        e = s + len(jfrag)
        # the primer should be somewhere to the left (5'end) of the J gene and
        # should cover somewhere after the 10 leftmost nucleotides
        if not (s <= 15 and e >= 10 and e < pos):
            continue
        extraseq = jseq[e:pos]
        cdr3len = len(seq.nuc) - seq.vindex + len(extraseq)
        if cdr3len % 3 != 0:  # out of frame
            continue
        # Translate the completed CDR3.
        newNuc = seq.nuc + extraseq
        newaa = iseqlib.nt2aa(newNuc[seq.vindex:])
        if newaa[-1] == 'F' and '*' not in newaa:
            if seq.aa not in newaa:
                sys.stderr.write('Warning: the new infered aa doesnot contain current aa\n')
            fullseqs.append(newNuc)
            newcdr3s.append(newaa)

    if len(fullseqs) == 0:
        sys.stderr.write('Attempted to fill in the right side of the nuc and CDR3 sequences. Zero productive matches found. Sequence looked at was: %s, %s, %s\n' %(seq.id, seq.nuc, seq.aa))
        # starts holds integer offsets; str.join needs strings, otherwise this
        # diagnostic itself raises TypeError whenever a raw match exists.
        sys.stderr.write("jfrag: %s, jseq: %s, matchesStarts: *%s*\n" %(jfrag, jseq, ','.join(str(s) for s in starts)))
        return -1
    if len(fullseqs) > 1:
        sys.stderr.write('Attempted to fill in the right side of the nuc and CDR3 sequences. Multiple productive matches found - could not decide. Sequence looked at was: %s, %s, %s\n' %(seq.id, seq.nuc, seq.aa))
        return -1

    # Exactly one placement: rewrite the clone with the completed sequences.
    seq.nuc = fullseqs[0]
    seq.aa = newcdr3s[0]
    seq.jindex = seq.vindex + len(seq.aa) * 3
    seq.cdr3nuc = seq.nuc[seq.vindex:seq.jindex]
    # Longest stretch of nuc that is in the same reading frame as vindex.
    frame = seq.vindex % 3
    inframe_end = len(seq.nuc) - ((len(seq.nuc) - frame) % 3)
    seq.inframenuc = seq.nuc[frame:inframe_end]
    seq.longaa = iseqlib.nt2aa(seq.inframenuc)
    return 1
Example #3
0
 def __init__(self, name, seq, count, vs, js, freq):
     """Store the clone's fields and build its aa|V|J header.

     self.vs and self.js hold sorted copies of the gene collections, while
     the header joins vs and js in the order the caller supplied them.
     self.aa is iseqlib.nt2aa applied to the stored sequence.
     """
     self.name, self.seq = name, seq
     self.count, self.freq = count, freq
     self.vs, self.js = sorted(vs), sorted(js)

     # Comma-joined gene fields, in the caller's order (not the sorted one).
     gene_fields = [','.join(vs), ','.join(js)]
     self.aa = iseqlib.nt2aa(self.seq)
     # header example: CASSLRRGGKPGELFF|TRBV5-4|TRBJ2-2
     self.header = '|'.join([self.aa] + gene_fields)
Example #4
0
 def __init__(self, name, seq, count, vs, js, freq, translate):
     """Store the clone's fields and build its sequence|V-families|J header.

     self.samples is name split on commas. self.vs and self.js hold sorted
     copies of the gene collections. The header's V field is the comma-joined
     result of getGeneFamilies(self.vs); its J field joins js in the order
     the caller supplied it. When translate is true the header carries
     iseqlib.nt2aa(seq) instead of seq; self.seq always keeps seq as given.
     """
     self.name, self.samples = name, name.split(',')
     self.seq = seq
     self.count, self.freq = count, freq
     self.vs, self.js = sorted(vs), sorted(js)

     v_field = ','.join(getGeneFamilies(self.vs))
     j_field = ','.join(js)
     leading = iseqlib.nt2aa(seq) if translate else seq
     # header example: CASSLRRGGKPGELFF|TRBV5|TRBJ2-2  (aminoacid|vfamilies|jgenes)
     self.header = '|'.join((leading, v_field, j_field))
Example #5
0
    def __init__(self, line):
        """Parse one tab-separated record into clone attributes.

        The line is stripped of surrounding whitespace and split on tabs; at
        least 27 columns are required, otherwise a message is written to
        stderr and the process exits with status 1. Column 1 is not read.
        Empty normalized frequency/count columns are stored as -1.0 / -1.
        """
        items = line.strip().split('\t')
        if len(items) < 27:
            sys.stderr.write('Wrong tsv format. Expected 27 fields, only have %d\n%s\n' %(len(items), line))
            sys.exit(1)

        def resolve_ties(gene, ties):
            # A gene column such as "A/B" keeps "A" as the gene and appends
            # the remaining names to the ", "-separated ties column when not
            # already listed. Any "*..." suffix is cut from the gene name.
            # Note: an empty ties column splits to [''], so the merged ties
            # string then begins with ', '.
            candidates = gene.split('/')
            if len(candidates) > 1:
                gene = candidates[0]
                tied = ties.split(', ')
                for other in candidates[1:]:
                    if other not in tied:
                        tied.append(other)
                ties = ', '.join(tied)
            return gene.split('*')[0], ties

        self.id = items[0]
        self.nuc = items[2]
        self.aa = items[3]
        self.normFreq = float(items[4]) if items[4] != '' else -1.0
        self.normCount = int(items[5]) if items[5] != '' else -1
        self.freq = float(items[6])
        self.count = int(items[7])
        self.cdr3len = int(items[8])

        self.vfam = items[9]
        self.vgene, self.vties = resolve_ties(items[10], items[11])
        self.dgene = items[12]
        self.jgene, self.jties = resolve_ties(items[13], items[14])

        # Columns 15-20 and 22-26 are integers; column 21 is kept as text.
        (self.vdel, self.d5del, self.d3del,
         self.jdel, self.n2ins, self.n1ins) = [int(field) for field in items[15:21]]
        self.status = items[21]
        (self.vindex, self.n1index, self.n2index,
         self.dindex, self.jindex) = [int(field) for field in items[22:27]]

        self.cdr3nuc = self.nuc[self.vindex:self.jindex]
        # Longest stretch of nuc that is in the same reading frame as vindex.
        frame = self.vindex % 3
        inframe_end = len(self.nuc) - ((len(self.nuc) - frame) % 3)
        self.inframenuc = self.nuc[frame:inframe_end]
        self.longaa = iseqlib.nt2aa(self.inframenuc)

        self.vpos2snp = None
        self.jpos2snp = None