def sequenta_getaa(nuc):
    """Translate a nucleotide sequence, masking a leading ambiguous codon.

    When ``nuc`` starts with "N", its first three nucleotides are represented
    by a single "X" residue and only ``nuc[3:]`` is translated with
    ``libcommon.nt2aa``. Any other sequence is translated in full.

    Raises:
        AssertionError: if ``nuc`` starts with "N" but is shorter than three
            nucleotides.
    """
    if not re.match("N", nuc):
        return libcommon.nt2aa(nuc)

    # The first codon cannot be translated; it is emitted as "X".
    assert len(nuc) >= 3
    return "X" + libcommon.nt2aa(nuc[3:])
def run(self):
    """Enumerate and save every nucleotide-level event for one clone and D event.

    ``self.clone`` is split on "_" into the V gene name, the CDR3 amino acid
    sequence and the J gene name. Every file in ``self.vdir`` / ``self.jdir``
    is named after a V / J deletion count and holds a gzipped pickle of
    insertion sequences for that deletion. For each (V file, J file, V-D
    insertion, D-J insertion) combination the CDR3 nucleotide sequence is
    assembled, checked against the CDR3 amino acids and wrapped in an
    ``lclone.Cdr3Clone``.

    Events are written as gzipped pickles named ``<self.outfile>_<batch>``
    holding at most ``batchsize`` events each; nothing is written when no
    event is produced.

    Raises:
        AssertionError: if an assembled sequence does not translate to the
            clone's CDR3 amino acid sequence.
    """
    batchsize = 100000
    currbatch = 0
    events = []

    items = self.clone.split('_')
    v = items[0]
    cdr3_aa = items[1]
    j = items[2]

    def load(path):
        # Close the handle as soon as the pickle is read instead of leaving
        # it to the garbage collector.
        with gzip.open(path, 'rb') as handle:
            return pickle.load(handle)

    def dump_batch(batch, index):
        # Closing the handle is what flushes the gzip trailer to disk.
        outfile = "%s_%d" % (self.outfile, index)
        with gzip.open(outfile, 'wb') as handle:
            pickle.dump(batch, handle)

    # A 3' deletion of zero must not be used as the slice bound "-0".
    if self.devent.d3del == 0:
        d_cdr3_nt = self.d_nt[self.devent.d5del:]
    else:
        d_cdr3_nt = self.d_nt[self.devent.d5del: -1 * self.devent.d3del]

    for vname in os.listdir(self.vdir):
        vd_ins_nts = load(os.path.join(self.vdir, vname))
        vdel = int(vname)  # the file name is the V deletion count
        self.logToMaster("vdel: %d; vd_ins_nts: %d" % (vdel, len(vd_ins_nts)))
        v_cdr3_nt = self.v_nt if vdel == 0 else self.v_nt[: -1 * vdel]

        for jname in os.listdir(self.jdir):
            dj_ins_nts = load(os.path.join(self.jdir, jname))
            jdel = int(jname)  # the file name is the J deletion count
            self.logToMaster("jdel: %d; dj_ins_nts: %d"
                             % (jdel, len(dj_ins_nts)))
            j_cdr3_nt = self.j_nt if jdel == 0 else self.j_nt[jdel:]

            for vd_ins in vd_ins_nts:
                # The first character of vd_ins and the last character of
                # dj_ins are left out of the assembled sequence (presumably
                # they repeat the flanking V / J nucleotide -- TODO confirm).
                head = v_cdr3_nt + vd_ins[1:] + d_cdr3_nt
                for dj_ins in dj_ins_nts:
                    cdr3_nt = head + dj_ins[:-1] + j_cdr3_nt
                    translated = lcommon.nt2aa(cdr3_nt)
                    if translated != cdr3_aa:
                        # Debugging aid: show the mismatch before asserting.
                        print(cdr3_nt)
                        print(translated)
                        print(cdr3_aa)
                    assert translated == cdr3_aa
                    event = lclone.Cdr3Clone(1, cdr3_nt, v, j, d=self.d,
                                             aa=cdr3_aa, vdel=vdel, jdel=jdel,
                                             d5del=self.devent.d5del,
                                             d3del=self.devent.d3del,
                                             vdins=vd_ins, djins=dj_ins)
                    events.append(event)
                    if len(events) >= batchsize:
                        dump_batch(events, currbatch)
                        currbatch += 1
                        events = []

    if len(events) > 0:
        dump_batch(events, currbatch)
def find_min_vdel(v_nt, cdr3_aa):
    """Return the minimum number of V nucleotides that must be deleted.

    The V sequence is translated and compared with ``cdr3_aa`` through
    ``left_max_match`` (presumably the longest amino acid match anchored at
    the left end -- TODO confirm against that helper). Three nucleotides per
    matched residue are kept. If V nucleotides remain and the CDR3 has a
    further residue, one or two more V nucleotides are kept when they are the
    start of a codon of that residue.

    Args:
        v_nt: V gene nucleotide sequence.
        cdr3_aa: CDR3 amino acid sequence.

    Returns:
        The number of nucleotides at the end of ``v_nt`` that cannot be part
        of the CDR3.
    """
    v_aa = lcommon.nt2aa(v_nt)
    matchseq = left_max_match(v_aa, cdr3_aa)  # max matched aa seq
    len_ntmatch = len(matchseq) * 3
    min_vdel = len(v_nt) - len_ntmatch

    # Nothing left on V, or no CDR3 residue left to extend into.
    if min_vdel <= 0 or len(cdr3_aa) <= len(matchseq):
        return min_vdel

    # Up to two V nucleotides following the matched codons; the slice yields
    # a single nucleotide when only one is left.
    v_nts = v_nt[len_ntmatch: len_ntmatch + 2]
    codons = lcommon.aa2codons(cdr3_aa[len(matchseq)])

    # Compare literally: the nucleotides are data, not a regular expression.
    if any(codon.startswith(v_nts) for codon in codons):
        min_vdel -= len(v_nts)
    elif len(v_nts) == 2 and any(codon.startswith(v_nts[0])
                                 for codon in codons):
        # The pair did not fit, but the first nucleotide alone does.
        min_vdel -= 1
    return min_vdel
def run(self):
    """Sum the likelihoods of all events of one V-J insertion set and save it.

    Every occurrence of ``self.d_nts`` inside each sequence of
    ``self.vjins_nts`` splits that sequence into a V-D insertion (before the
    occurrence) and a D-J insertion (after it). Each split becomes an
    ``lclone.Cdr3Clone`` that is scored with ``ntclone_likelihood``; the
    log10 scores are converted back to probabilities and summed. The total is
    written to ``self.outfile`` as a gzipped pickle.

    Raises:
        AssertionError: if a sequence is shorter than two nucleotides or the
            assembled CDR3 does not translate to ``self.cdr3_aa``.
    """
    d_nts = self.d_nts
    d_len = len(d_nts)
    # An empty pattern matches at every position, which enumerates every
    # possible split point for an empty D.
    d_pattern = re.compile(d_nts)
    v_hang_right = 0 if self.v_hang == 0 else 3 - self.v_hang

    # Adjust empty D llh (TEMPORARY HACK).
    empty_d_adjust = 0.0
    if d_nts == '':
        if self.d == 'TRBD1':
            empty_d_adjust = log10(13)
        elif self.d == 'TRBD2':
            empty_d_adjust = log10(17)

    total_llh = 0.0
    for vjins in self.vjins_nts:
        assert len(vjins) >= 2  # because it has the last v and the first j
        cdr3_nt = self.v_cdr3_nt + vjins[1:-1] + self.j_cdr3_nt
        for mobj in d_pattern.finditer(vjins):
            # Special case: some placements of a 3- or 4-nt D are skipped
            # (presumably because they are counted elsewhere -- TODO confirm).
            shifted = mobj.start() - v_hang_right - 1
            if d_len == 3 and shifted % 3 == 0:
                continue
            if d_len == 4 and shifted != 1:
                continue

            vdins = vjins[: mobj.start()]
            djins = vjins[mobj.end():]
            assert lcommon.nt2aa(cdr3_nt) == self.cdr3_aa
            event = lclone.Cdr3Clone(1, cdr3_nt, self.v, self.j, d=self.d,
                                     aa=self.cdr3_aa, vdel=self.vdel,
                                     jdel=self.jdel, d5del=self.d5del,
                                     d3del=self.d3del, vdins=vdins,
                                     djins=djins)
            llh = ntclone_likelihood(event, self.model) + empty_d_adjust
            total_llh += 10 ** llh

    # Close the handle explicitly so the gzip stream is complete on disk.
    with gzip.open(self.outfile, 'wb') as handle:
        pickle.dump(total_llh, handle)
def find_min_jdel(j_nt, cdr3_aa):
    """Return the minimum number of J nucleotides that must be deleted.

    The J sequence is translated after skipping its leading
    ``len(j_nt) % 3`` nucleotides, and compared with ``cdr3_aa`` through
    ``right_max_match``. Three nucleotides per matched residue are kept. If J
    nucleotides remain and the CDR3 has a preceding residue, one or two more
    J nucleotides are kept when they are the end of a codon of that residue.

    Args:
        j_nt: J gene nucleotide sequence.
        cdr3_aa: CDR3 amino acid sequence.

    Returns:
        The number of nucleotides at the start of ``j_nt`` that cannot be
        part of the CDR3.
    """
    frame_start = len(j_nt) % 3  # 5' nucleotides that are not part of a codon
    j_aa = lcommon.nt2aa(j_nt[frame_start:])
    matchseq = right_max_match(j_aa, cdr3_aa)
    matched_nt = 3 * len(matchseq)
    min_jdel = len(j_nt) - matched_nt

    # Nothing left on J, or no CDR3 residue left to extend into.
    if min_jdel <= 0 or len(cdr3_aa) <= len(matchseq):
        return min_jdel

    # One or two J nucleotides just before the matched codons.
    if min_jdel >= 2:
        j_nts = j_nt[min_jdel - 2: min_jdel]
    else:
        j_nts = j_nt[min_jdel - 1]

    prev_aa = cdr3_aa[len(cdr3_aa) - len(matchseq) - 1]
    codons = lcommon.aa2codons(prev_aa)

    if any(codon[-len(j_nts):] == j_nts for codon in codons):
        min_jdel -= len(j_nts)
    elif len(j_nts) == 2 and any(codon[2] == j_nts[1] for codon in codons):
        # The pair did not fit, but the nucleotide next to J alone does.
        min_jdel -= 1
    return min_jdel
def run(self):
    """Enumerate and save the events of a clone for a short or empty D.

    For every V deletion in ``[self.min_vdel, self.max_vdel]`` and every J
    deletion in ``[self.min_jdel, self.max_jdel]``, ``get_vjins_emptyd``
    supplies the candidate V-J insertion sequences (or None when there are
    none). Each occurrence of the D nucleotides
    (``self.devent.left_nts + self.devent.right_nts``) inside a candidate
    splits it into a V-D insertion and a D-J insertion, giving one
    ``lclone.Cdr3Clone`` event.

    Events are written as gzipped pickles named ``<self.outfile>_<batch>``
    holding at most ``batchsize`` events each; nothing is written when no
    event is produced.

    Raises:
        AssertionError: if a candidate is shorter than two nucleotides or an
            assembled CDR3 does not translate to ``self.cdr3_aa``.
    """
    items = self.clone.split('_')
    v = items[0]
    j = items[2]

    batchsize = 100000
    currbatch = 0
    events = []

    def dump_batch(batch, index):
        # Closing the handle is what flushes the gzip trailer to disk.
        outfile = "%s_%d" % (self.outfile, index)
        with gzip.open(outfile, 'wb') as handle:
            pickle.dump(batch, handle)

    # The D nucleotides do not depend on the deletions: compute them once.
    d_nts = self.devent.left_nts + self.devent.right_nts
    d_len = len(d_nts)
    # An empty pattern matches at every position, which enumerates every
    # possible split point for an empty D.
    d_pattern = re.compile(d_nts)

    for vdel in range(self.min_vdel, self.max_vdel + 1):
        v_cdr3_nt = self.v_nt if vdel == 0 else self.v_nt[: -1 * vdel]
        v_hang = len(v_cdr3_nt) % 3
        v_hang_right = 0 if v_hang == 0 else 3 - v_hang

        for jdel in range(self.min_jdel, self.max_jdel + 1):
            j_cdr3_nt = self.j_nt if jdel == 0 else self.j_nt[jdel:]
            vjins_nts = get_vjins_emptyd(self.v_nt, vdel, self.j_nt, jdel,
                                         d_nts, self.cdr3_aa)
            if vjins_nts is None:
                continue

            for vjins in vjins_nts:
                # because it has the last v and the first j
                assert len(vjins) >= 2
                cdr3_nt = v_cdr3_nt + vjins[1:-1] + j_cdr3_nt
                for mobj in d_pattern.finditer(vjins):
                    start = mobj.start()
                    end = mobj.end()
                    # Special case: some placements of a 3- or 4-nt D are
                    # skipped (presumably because they are counted elsewhere
                    # -- TODO confirm).
                    shifted = start - v_hang_right - 1
                    if d_len == 3 and shifted % 3 == 0:
                        continue
                    if d_len == 4 and shifted != 1:
                        continue

                    vdins = vjins[:start]
                    djins = vjins[end:]
                    assert lcommon.nt2aa(cdr3_nt) == self.cdr3_aa
                    event = lclone.Cdr3Clone(1, cdr3_nt, v, j, d=self.d,
                                             aa=self.cdr3_aa, vdel=vdel,
                                             jdel=jdel,
                                             d5del=self.devent.d5del,
                                             d3del=self.devent.d3del,
                                             vdins=vdins, djins=djins)
                    events.append(event)
                    if len(events) >= batchsize:
                        dump_batch(events, currbatch)
                        currbatch += 1
                        events = []

    if len(events) > 0:
        dump_batch(events, currbatch)
def find_devents2(d_nt, cdr3_aa, dels2freq):
    """Find all D deletion events whose leftover can be part of ``cdr3_aa``.

    Every (d5del, d3del) pair with a non-zero entry in ``dels2freq`` is
    considered, including pairs that delete the whole D:

    * nothing left: one event is emitted, only for ``d5del == len(d_nt) // 2``,
      so that a fully deleted D is represented once;
    * 1-2 nucleotides left: one event carrying the leftover, with no amino
      acid match (-1, -1);
    * 3 or more left: for each of the three reading frames, the translated
      leftover is looked up with ``find_dmatches`` and one event is emitted
      per match, carrying the untranslated flanking nucleotides; a leftover
      of 3 or 4 nucleotides additionally gets an unmatched (-1, -1) event.

    Args:
        d_nt: D gene nucleotide sequence.
        cdr3_aa: CDR3 amino acid sequence.
        dels2freq: mapping from (d5del, d3del) to its frequency.

    Returns:
        A list of ``Devent`` objects.
    """
    devents = []
    dlen = len(d_nt)
    for d5del in range(dlen + 1):
        # The bound is inclusive so that d5del + d3del can reach dlen;
        # otherwise the fully deleted D below could never be produced.
        for d3del in range(dlen - d5del + 1):
            dels = (d5del, d3del)
            if dels not in dels2freq or dels2freq[dels] == 0:
                continue

            d_leftover_nt = d_nt[d5del: dlen - d3del]
            dlen2 = len(d_leftover_nt)

            if dlen2 == 0:
                # Many (d5del, d3del) pairs delete the whole D; keep one.
                if d5del == dlen // 2:
                    devents.append(Devent(d5del, d3del, '', -1, -1, ''))
            elif dlen2 < 3:
                # Too short to translate: keep the leftover unsplit.
                devents.append(
                    Devent(d5del, d3del, d_leftover_nt, -1, -1, ''))
            else:
                for left in (0, 1, 2):  # each translation frame
                    d_aa = lcommon.nt2aa(d_leftover_nt[left:])
                    if not d_aa:  # empty sequence
                        continue
                    matches = find_dmatches(d_aa, cdr3_aa)
                    if len(matches) == 0:
                        continue
                    right = left + len(d_aa) * 3
                    left_nts = d_leftover_nt[:left]
                    right_nts = d_leftover_nt[right:]
                    for match in matches:
                        devents.append(Devent(d5del, d3del, left_nts,
                                              match[0], match[1], right_nts))
                if dlen2 in (3, 4):
                    devents.append(
                        Devent(d5del, d3del, d_leftover_nt, -1, -1, ''))
    return devents
def run(self):
    """Sum the likelihoods of all V-D / D-J insertion pairs and save the total.

    The model and the two insertion lists are read from gzipped pickles
    (``self.modelfile``, ``self.vfile``, ``self.jfile``). Each insertion pair
    is assembled with the V, D and J CDR3 nucleotides into an
    ``lclone.Cdr3Clone`` that is scored with ``ntclone_likelihood``; the
    log10 scores are converted back to probabilities and summed. The total is
    written to ``self.outfile`` as a gzipped pickle.

    Raises:
        AssertionError: if an assembled CDR3 does not translate to
            ``self.cdr3_aa``.
    """
    def load(path):
        # Close the handle as soon as the pickle is read instead of leaving
        # it to the garbage collector.
        with gzip.open(path, 'rb') as handle:
            return pickle.load(handle)

    model = load(self.modelfile)
    vd_ins_nts = load(self.vfile)
    dj_ins_nts = load(self.jfile)

    total_llh = 0.0
    for vd_ins in vd_ins_nts:
        # The first character of vd_ins and the last character of dj_ins are
        # left out of the assembled sequence (presumably they repeat the
        # flanking V / J nucleotide -- TODO confirm).
        head = self.v_cdr3_nt + vd_ins[1:] + self.d_cdr3_nt
        for dj_ins in dj_ins_nts:
            cdr3_nt = head + dj_ins[:-1] + self.j_cdr3_nt
            assert lcommon.nt2aa(cdr3_nt) == self.cdr3_aa
            event = lclone.Cdr3Clone(1, cdr3_nt, self.v, self.j, d=self.d,
                                     aa=self.cdr3_aa, vdel=self.vdel,
                                     jdel=self.jdel, d5del=self.d5del,
                                     d3del=self.d3del, vdins=vd_ins,
                                     djins=dj_ins)
            llh = ntclone_likelihood(event, model)
            total_llh += 10 ** llh

    # Close the handle explicitly so the gzip stream is complete on disk.
    with gzip.open(self.outfile, 'wb') as handle:
        pickle.dump(total_llh, handle)
def adaptive_parseline(line, index2col):
    """Parse one tab-separated data line into a ``Clone``.

    Args:
        line: one data line of the file.
        index2col: mapping from column index to column name, built from the
            header line.

    Returns:
        A ``Clone`` built from the required columns (count, frequencyCount,
        nucleotide, V/J gene names and ties) and enriched with whatever
        optional columns are present (D genes, sequence status, CDR3 amino
        acids, junction positions, deletion counts). Returns None when the
        line does not have as many fields as the header or when a required
        column is missing.
    """
    items = line.strip("\n").split("\t")
    if len(items) != len(index2col):
        sys.stderr.write(
            "Inconsistent number of columns between the following "
            "line and the header line, skipped it:\n"
            "Line:\n%s\n" % line
        )
        return None

    valid_cols = adaptive_columns()
    col2val = {}
    for i, col in index2col.items():
        if col in valid_cols:
            col2val[col] = items[i].replace("/", ", ")

    # Return None if line does not have minimum required fields.
    required_cols = [
        "count",
        "frequencyCount",
        "nucleotide",
        "vGeneName",
        "jGeneName",
        "vGeneNameTies",
        "jGeneNameTies",
    ]
    for c in required_cols:
        if c not in col2val:
            return None

    def resolve_genes(name_col, ties_col):
        # An "unresolved" gene falls back to the comma-separated ties column.
        # Names are stripped because "/" was rewritten to ", " above, which
        # would otherwise leave a leading space on every name but the first.
        name = col2val[name_col]
        if name != "unresolved":
            return [name]
        return [gene.strip() for gene in col2val[ties_col].split(",")]

    count = int(col2val["count"])
    freq = float(col2val["frequencyCount"]) / 100.0  # convert to non percentage
    nuc = col2val["nucleotide"]
    vgenes = resolve_genes("vGeneName", "vGeneNameTies")
    jgenes = resolve_genes("jGeneName", "jGeneNameTies")

    # Clone with required fields
    clone = Clone(count, freq, nuc, vgenes, jgenes)

    # Additional information if available
    # Gene info:
    if "dGeneName" in col2val:
        clone.dgenes = resolve_genes("dGeneName", "dGeneNameTies")
    if "sequenceStatus" in col2val:
        status = col2val["sequenceStatus"].lower()
        if status == "in":
            clone.productive = True
        elif status in ("out", "stop"):
            clone.productive = False
        else:
            sys.stderr.write("Unknown status: %s\n" % status)
    if "aminoAcid" in col2val:
        clone.cdr3aa = col2val["aminoAcid"]

    # Junctional info:
    # `offset` is the number of leading nucleotides trimmed from clone.nuc;
    # every position read from the file is shifted by it.
    offset = 0
    if "vIndex" in col2val:
        vindex = int(col2val["vIndex"])
        if clone.productive:
            # Make sure nuc is inframe:
            offset = vindex % 3
            nuclen = len(clone.nuc)
            endoffset = (nuclen - offset) % 3
            clone.nuc = clone.nuc[offset: nuclen - endoffset]
            clone.aa = libcommon.nt2aa(clone.nuc)
        if clone.cdr3aa:
            cdr3len = len(clone.cdr3aa) * 3
            # vIndex refers to the untrimmed read, so shift it like the other
            # positions, and stop at the end of the CDR3 (or of the read,
            # whichever comes first).
            cdr3start = vindex - offset
            endindex = min(len(clone.nuc), cdr3start + cdr3len)
            clone.cdr3nuc = clone.nuc[cdr3start:endindex]
    if "dIndex" in col2val:
        clone.firstdpos = int(col2val["dIndex"]) - offset
    if "n2Index" in col2val:
        n2index = int(col2val["n2Index"])
        if n2index != -1:
            clone.lastvpos = n2index - 1 - offset
        elif clone.firstdpos:  # No d5ins
            clone.lastvpos = clone.firstdpos - 1
    if "jIndex" in col2val:
        clone.firstjpos = int(col2val["jIndex"]) - offset
    if "n1Index" in col2val:
        n1index = int(col2val["n1Index"])
        if n1index != -1:
            clone.lastdpos = n1index - 1 - offset
        elif clone.firstjpos:  # No d3ins
            clone.lastdpos = clone.firstjpos - 1

    # Deletion info:
    if "vDeletion" in col2val:
        clone.vdel = int(col2val["vDeletion"])
    if "d5Deletion" in col2val:
        clone.d5del = int(col2val["d5Deletion"])
    if "d3Deletion" in col2val:
        clone.d3del = int(col2val["d3Deletion"])
    if "jDeletion" in col2val:
        clone.jdel = int(col2val["jDeletion"])
    return clone