def mitcr_parseline(line, index2col):
    """Parse one data line of a MiTCR tab-separated file into a Clone.

    Args:
        line: Raw text of the data line; a trailing line terminator is allowed.
        index2col: Dict mapping each 0-based column index to its header name,
            with one entry per column of the header line.

    Returns:
        A Clone built from the line, or None when the number of fields differs
        from the header (a warning is written to stderr) or when any required
        field is missing or empty.

    Raises:
        ValueError: If "Read count", "Percentage" or a non-empty position
            field does not hold a number.
    """
    # Strip only the line terminator. A bare strip() would also remove
    # leading/trailing tabs, silently dropping empty first/last fields and
    # making an otherwise valid row look inconsistent with the header.
    items = line.rstrip("\r\n").split("\t")
    if len(items) != len(index2col):
        sys.stderr.write(
            "Inconsistent number of columns between the following "
            "line and the header line, skipped it:\n"
            "Line:\n%s\n" % line
        )
        return None

    # Keep only the columns that mitcr_columns() recognises.
    valid_cols = mitcr_columns()
    col2val = {}
    for i, col in index2col.items():
        if col in valid_cols:
            col2val[col] = items[i]

    # Return None if line does not have minimum required fields.
    required_cols = (
        "Read count",
        "Percentage",
        "CDR3 nucleotide sequence",
        "V segments",
        "J segments",
    )
    for col in required_cols:
        if not col2val.get(col):
            return None

    count = int(col2val["Read count"])
    # "Percentage" is converted to a fraction.
    freq = float(col2val["Percentage"]) / 100.0
    nuc = col2val["CDR3 nucleotide sequence"]
    vgenes = col2val["V segments"].split(", ")
    jgenes = col2val["J segments"].split(", ")

    clone = Clone(count, freq, nuc, vgenes, jgenes, cdr3nuc=nuc)
    clone.productive = True  # Assuming MiTCR only output productive clones

    # Optional comma-separated gene/allele lists.
    list_fields = (
        ("D segments", "dgenes"),
        ("V alleles", "valleles"),
        ("J alleles", "jalleles"),
        ("D alleles", "dalleles"),
    )
    for col, attr in list_fields:
        if col in col2val:
            setattr(clone, attr, col2val[col].split(", "))

    if "CDR3 amino acid sequence" in col2val:
        clone.aa = col2val["CDR3 amino acid sequence"]
        clone.cdr3aa = col2val["CDR3 amino acid sequence"]

    # Optional junction positions. An empty field is left at the Clone's
    # default instead of crashing in int("").
    position_fields = (
        ("Last V nucleotide position", "lastvpos"),
        ("First D nucleotide position", "firstdpos"),
        ("Last D nucleotide position", "lastdpos"),
        ("First J nucleotide position", "firstjpos"),
    )
    for col, attr in position_fields:
        if col2val.get(col):
            setattr(clone, attr, int(col2val[col]))

    return clone
def sequenta_parseline(line, index2col): items = line.strip("\n").split("\t") if len(items) != len(index2col): sys.stderr.write( "Incosistent number of columns between the following\ line and the header line, skipped it:\n\ Line:\n%s\n" % line ) return None col2val = {} valid_cols = sequenta_columns() for i, col in index2col.iteritems(): if col in valid_cols: col2val[col] = items[i] # Return None if clone is "Water" if "Patient" in col2val and col2val["Patient"] == "Water": return None # Return None if line does not have minimum required fields. required_cols = [ "Total_Read_Count", "Log10_Frequency", "Clone_Sequence", "V_Segment_Major_Gene", "J_Segment_Major_Gene", ] for c in required_cols: if c not in col2val or col2val[c] in ["NAN", "", "-"]: return None count = libcommon.soft_int(col2val["Total_Read_Count"]) try: freq = 10 ** float(col2val["Log10_Frequency"]) except: # Return None if clone does not have a valid frequency return None nuc = col2val["Clone_Sequence"] vgenes = col2val["V_Segment_Major_Gene"].split("; ") jgenes = col2val["J_Segment_Major_Gene"].split("; ") # Clone with required fields clone = Clone(count, freq, nuc, vgenes, jgenes) # Additional information if available # Gene info: if "D_Segment_Major_Allele" in col2val: dstr = col2val["D_Segment_Major_Allele"] if dstr not in ["NAN", "", "-"]: dalleles = dstr.split("; ") dgenes = [] for d in dalleles: dgene = d.split("*")[0] if dgene not in dgenes: dgenes.append(dgene) clone.dgenes = dgenes clone.dalleles = dalleles if not clone.dgenes: # no dgenes info jgroups = get_j_groups(clone.jgenes) if ["1"] == jgroups: clone.dgenes = ["TRBD1"] else: clone.dgenes = [random.choice(["TRBD1", "TRBD2"])] if "V_Segment_Major_Allele" in col2val: clone.valleles = col2val["V_Segment_Major_Allele"].split("; ") if "J_Segment_Major_Allele" in col2val: clone.jalleles = col2val["J_Segment_Major_Allele"].split("; ") # Sequence ID, status and cdr3aa: if "Sample" in col2val: clone.samplename = col2val["Sample"] if "Patient" 
in col2val: clone.patient = col2val["Patient"] if "Clone_Index" in col2val: clone.id = col2val["Clone_Index"] if "Is_Good_Frame" in col2val: if col2val["Is_Good_Frame"].lower() == "true": clone.productive = True else: clone.productive = False if "Clone_Protein_Sequence" in col2val: clone.aa = col2val["Clone_Protein_Sequence"].replace("*", "Z") offset = 0 if "CDR3_Sense_Sequence" in col2val: clone.cdr3nuc = col2val["CDR3_Sense_Sequence"] if not re.search(clone.cdr3nuc, clone.nuc): clone.nuc = libcommon.rc(clone.nuc) try: cdr3aa = sequenta_getaa(clone.cdr3nuc) clone.cdr3aa = cdr3aa except: # return None if cannot translate cdr3nuc return None # Make sure nuc is in frame cdr3start = re.search(clone.cdr3nuc, clone.nuc).start() offset = cdr3start % 3 nuclen = len(clone.nuc) endoffset = (nuclen - offset) % 3 clone.nuc = clone.nuc[offset : nuclen - endoffset] # Junctional info: if "V_Segment_Extension_Length" in col2val: vins = libcommon.soft_int(col2val["V_Segment_Extension_Length"]) clone.lastvpos = vins - 1 - offset if "N_Bases_adjacent_V" in col2val: d5ins = col2val["N_Bases_adjacent_V"] if not d5ins.startswith("-") and d5ins not in ["", "NAN"]: clone.firstdpos = clone.lastvpos + int(d5ins) + 1 if "J_Segment_Extension_Length" in col2val: jins = libcommon.soft_int(col2val["J_Segment_Extension_Length"]) clone.firstjpos = len(clone.nuc) - jins if "N_Bases_adjacent_J" in col2val: d3ins = col2val["N_Bases_adjacent_J"] if not d3ins.startswith("-") and d3ins not in ["", "NAN"]: clone.lastdpos = clone.firstjpos - int(d3ins) - 1 # Deletions: if "V_Segment_Deletion_Length" in col2val: vdel = col2val["V_Segment_Deletion_Length"] if not vdel.startswith("-") and vdel not in ["", "NAN"]: clone.vdel = libcommon.soft_int(vdel) if "J_Segment_Deletion_Length" in col2val: jdel = col2val["J_Segment_Deletion_Length"] if not jdel.startswith("-") and jdel not in ["", "NAN"]: clone.jdel = libcommon.soft_int(jdel) # Special treatment for D info: d2fulllen = {"TRBD1": 12, "TRBD2": 16} if 
"D_Segment_length" in col2val: dgene = clone.dgenes[0] dfulllen = d2fulllen[dgene] dlen = col2val["D_Segment_length"] if not dlen.startswith("-") and dlen not in ["", "NAN"]: ddel = dfulllen - int(dlen) clone.d5del, clone.d3del = get_ddels(ddel) # clone.d5del = ddel / 2.0 # clone.d3del = ddel - clone.d5del else: # all D was deleted clone.d5del, clone.d3del = get_ddels(dfulllen) # clone.d5del = dfulllen / 2 # clone.d3del = dfulllen - clone.d5del ndn = clone.firstjpos - clone.lastvpos clone.firstdpos = clone.lastvpos + ndn / 2 + 1 clone.lastdpos = clone.firstdpos - 1 return clone