def get_vrc01class_germline_lights(clone=VK320):
    """Return the germline light-chain entry (or entries) for ``clone``.

    Each entry is a ``(gene name, amino-acid sequence)`` tuple wrapped in
    ``sequence.Sequence``.

    Args:
        clone: One of the module-level identifiers ``VK320``, ``VK133``,
            ``VL214`` or ``VK15``. Any other value selects the fallback.

    Returns:
        A single ``sequence.Sequence`` when ``clone`` matches one of the
        four identifiers; otherwise a list of ``sequence.Sequence`` objects
        for IGKV3-20, IGKV1-33 and IGLV2-14 (IGKV1-5 is not part of the
        fallback list).
    """
    vk3_20_gl = ('IGKV3-20', 'EIVLTQSPGTLSLSPGERATLSCRASQSVSSSYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVYYCQQYGSSP')
    vk1_33_gl = ('IGKV1-33', 'DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQYDNLP')
    vl2_14_gl = ('IGLV2-14', 'QSALTQPASVSGSPGQSITISCTGTSSDVGGYNYVSWYQQHPGKAPKLMIYEVSNRPSGVSNRFSGSKSGNTASLTISGLQAEDEADYYCSSYTSSSTL')
    vk1_5_gl = ('IGKV1-5', 'DIQMTQSPSTLSASVGDRVTITCRASQSISSWLAWYQQKPGKAPKLLIYDASSLESGVPSRFSGSGSGTEFTLTISSLQPDDFATYYCQQYNSYS')

    # Single-clone requests return one Sequence, tested in the same order
    # as the identifiers are listed above.
    if clone == VK320:
        return sequence.Sequence(vk3_20_gl)
    if clone == VK133:
        return sequence.Sequence(vk1_33_gl)
    if clone == VL214:
        return sequence.Sequence(vl2_14_gl)
    if clone == VK15:
        return sequence.Sequence(vk1_5_gl)

    # Unrecognised clone: hand back the three-entry fallback panel.
    fallback_panel = [vk3_20_gl, vk1_33_gl, vl2_14_gl]
    return [sequence.Sequence(entry) for entry in fallback_panel]
def get_vrc01_class_sequences(chain='heavy', vgene_only=True, only_include=None):
    """Return the built-in VRC01-class reference sequences.

    Args:
        chain: ``'heavy'`` selects the heavy-chain panel. Any other value
            selects the light-chain panel, which is empty in this function.
        vgene_only: When true, the heavy-chain entries are the shorter
            sequences ending at ``...CAR``/``...CTR``; otherwise the longer
            entries that continue through ``...VSS``/``...VSA`` are used.
        only_include: Optional name, or iterable of names, used to filter
            the panel by entry name. ``None`` disables filtering.

    Returns:
        A list of ``sequence.Sequence`` objects, one per selected
        ``(name, amino-acid sequence)`` entry.
    """
    if vgene_only:
        heavy = [
            ('VRC01', 'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTR'),
            ('PGV04', 'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCAR'),
            ('VRC-CH31', 'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCAR'),
            ('3BNC60', 'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCAR'),
            ('12A12', 'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCAR'),
            ('PGV20', 'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCAR'),
        ]
    else:
        heavy = [
            ('VRC01', 'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTRGKNCDYNWDFEHWGRGTPVIVSS'),
            ('PGV04', 'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCARQKFYTGGQGWYFDLWGRGTLIVVSS'),
            ('VRC-CH31', 'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCARAQKRGRSEWAYAHWGQGTPVVVSS'),
            ('3BNC60', 'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCARQRSDFWDFDVWGSGTQVTVSS'),
            ('12A12', 'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCARDGSGDDTSWHLDPWGQGTLVIVSA'),
            ('PGV20', 'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCARRMRSQDREWDFQHWGQGTRIIVSS'),
        ]

    # No light-chain entries are defined here, so any non-'heavy' chain
    # yields an empty result.
    light = []
    seqs = heavy if chain == 'heavy' else light

    if only_include is not None:
        # `unicode` only exists on Python 2; referencing it unguarded raises
        # NameError on Python 3, so resolve the string types defensively.
        try:
            string_types = (str, unicode)  # noqa: F821 (Python 2 only)
        except NameError:
            string_types = (str,)
        # A bare name is wrapped in a list so the membership test below
        # matches whole names instead of substrings.
        if isinstance(only_include, string_types):
            only_include = [only_include]
        seqs = [s for s in seqs if s[0] in only_include]

    return [sequence.Sequence(s) for s in seqs]
def vrc01_class_mutation_count(seqs, vgene_only=True):
    """Count total and VRC01-class-shared mutations for each input sequence.

    Every input is aligned with ``muscle`` against the VRC01-class panel
    and the glVRC01 germline entry. Positions are then compared column by
    column.

    Args:
        seqs: Iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: Forwarded to ``get_vrc01_class_sequences`` and
            ``get_vrc01_germline_sequence``.

    Returns:
        ``(shared, total)``: two lists with one integer per input, in input
        order. ``total`` counts columns where the germline is not a gap and
        the query differs from it. ``shared`` counts columns where, for at
        least one panel member, the query equals that member and differs
        from the germline.
    """
    import re
    residue_re = re.compile("[a-zA-Z]")

    queries = [sequence.Sequence([rec['seq_id'], rec['vdj_aa']]) for rec in seqs]

    panel = get_vrc01_class_sequences(vgene_only=vgene_only)
    panel_ids = [member.id for member in panel]

    germline = get_vrc01_germline_sequence(vgene_only=vgene_only)
    germline_id = germline.id

    shared = []
    total = []
    for query in queries:
        aln = muscle([query] + panel + [germline])
        aln_query = [rec for rec in aln if rec.id == query.id][0]
        aln_germ = [rec for rec in aln if rec.id == germline_id][0]
        aln_panel = [rec for rec in aln if rec.id in panel_ids]

        # Trim every aligned record to the span between the query's first
        # and last letters, dropping gap-only flanks. Nothing is trimmed
        # when fewer than two letters are found.
        letter_cols = [hit.start() for hit in re.finditer(residue_re, str(aln_query.seq))]
        if len(letter_cols) > 1:
            start = letter_cols[0]
            stop = letter_cols[-1] + 1
            aln_query.seq = aln_query.seq[start:stop]
            aln_germ.seq = aln_germ.seq[start:stop]
            for member in aln_panel:
                member.seq = member.seq[start:stop]

        query_str = str(aln_query.seq)
        germ_str = str(aln_germ.seq)

        # Total: query differs from germline at a non-gap germline column.
        total.append(sum(1 for q, g in zip(query_str, germ_str)
                         if g != '-' and q != g))

        # Per panel member, flag columns where the query matches the member
        # but not the germline. Keyed by id, so duplicate ids overwrite.
        flags_by_member = {}
        for member in aln_panel:
            flags_by_member[member.id] = [
                q == v and q != g
                for q, g, v in zip(query_str, germ_str, str(member.seq))
            ]

        # A column is shared if any panel member flagged it.
        shared.append(sum(1 for column in zip(*flags_by_member.values())
                          if any(column)))

    return shared, total
def get_vrc01_germline_sequence(vgene_only=True):
    """Return the ``glVRC01`` germline entry as a ``sequence.Sequence``.

    Args:
        vgene_only: When true, return the shorter entry ending at
            ``...YYCAR``; otherwise return the longer entry that continues
            through ``...TVSS``.

    Returns:
        ``sequence.Sequence`` built from a ``('glVRC01', amino-acid
        sequence)`` tuple.
    """
    if not vgene_only:
        return sequence.Sequence(
            ('glVRC01', 'QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARGKNSDYNWDFQHWGQGTLVTVSS'))

    # HACK: this entry deliberately omits the leading 'QVQLVQSGAEVKK'
    # residues so that it works with JH sequences; the alignment sometimes
    # misbehaves at the N-terminus when they are present.
    return sequence.Sequence(
        ('glVRC01', 'PGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR'))
def vrc01_class_mutation_count(seqs, vgene_only=True):
    """Delegate heavy-chain VRC01-class mutation counting to ``identifyvrc01muts``.

    Args:
        seqs: Iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: Forwarded to the reference-panel and germline getters.

    Returns:
        Whatever ``identifyvrc01muts`` returns for the wrapped inputs, the
        selected reference panel, the glVRC01 germline and ``VH12``.
    """
    wrapped = [sequence.Sequence([rec['seq_id'], rec['vdj_aa']]) for rec in seqs]

    # `expanded` is not a parameter; it is read from the enclosing scope.
    if expanded:
        reference_panel = get_expanded_vrc01_class_sequences(vgene_only=vgene_only)
    else:
        reference_panel = get_vrc01_class_sequences(vgene_only=vgene_only)

    germline = get_vrc01_germline_sequence(vgene_only=vgene_only)
    return identifyvrc01muts(wrapped, reference_panel, germline, VH12)
def get_expanded_vrc01_class_sequences(chain='heavy', vgene_only=True, only_include=None):
    """Return the expanded VRC01-class reference panel.

    The heavy-chain panel holds the entries VRC01, PGV04, VRC-CH31, 3BNC60,
    12A12, PGV20 and PCIN63-71Ja.

    Args:
        chain: ``'heavy'`` selects the heavy-chain panel. Any other value
            selects the light-chain panel, which is empty in this function.
        vgene_only: Accepted for signature parity with the non-expanded
            getter, but never read here; the same entries are returned
            either way.
        only_include: Optional name, or iterable of names, used to filter
            the panel by entry name. ``None`` disables filtering.

    Returns:
        A list of ``sequence.Sequence`` objects, one per selected
        ``(name, amino-acid sequence)`` entry.
    """
    heavy = [
        ('VRC01', 'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTR'),
        ('PGV04', 'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCAR'),
        ('VRC-CH31', 'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCAR'),
        ('3BNC60', 'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCAR'),
        ('12A12', 'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCAR'),
        ('PGV20', 'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCAR'),
        ('PCIN63-71Ja', 'QVQLVQSGAEVKKPGASVRVSCKASGYTFNSCLIHWWRQAPGQGLQWMAWINPLHGAVNYAHQFQGRITVTRDTSIDTAYMELRGLRSDDTATYYCTR'),
    ]
    # No light-chain entries are defined here.
    light = []
    seqs = heavy if chain == 'heavy' else light

    if only_include is not None:
        # `unicode` only exists on Python 2; referencing it unguarded raises
        # NameError on Python 3, so resolve the string types defensively.
        try:
            string_types = (str, unicode)  # noqa: F821 (Python 2 only)
        except NameError:
            string_types = (str,)
        # A bare name is wrapped in a list so the membership test below
        # matches whole names instead of substrings.
        if isinstance(only_include, string_types):
            only_include = [only_include]
        seqs = [s for s in seqs if s[0] in only_include]

    return [sequence.Sequence(s) for s in seqs]
def vrc01_class_mutation_count_light_chain(seqs, vgene_only=True, vgene=VK320):
    """Delegate light-chain VRC01-class mutation counting to ``identifyvrc01muts``.

    Args:
        seqs: Iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: Forwarded to the reference-panel getter.
        vgene: Light-chain V-gene identifier. It selects both the reference
            panel (non-expanded mode) and the germline entry, and is passed
            through to ``identifyvrc01muts``.

    Returns:
        Whatever ``identifyvrc01muts`` returns for the wrapped inputs, the
        selected reference panel, the germline light chain and ``vgene``.
    """
    input_seqs = [sequence.Sequence([s['seq_id'], s['vdj_aa']]) for s in seqs]

    # `expanded` is not a parameter; it is read from the enclosing scope.
    if expanded:
        # NOTE(review): no `chain` is passed here, so the expanded getter
        # falls back to its own default chain rather than `vgene` -- confirm
        # this is intended for a light-chain comparison.
        vrc01_seqs = get_expanded_vrc01_class_sequences(vgene_only=vgene_only)
    else:
        vrc01_seqs = get_vrc01_class_sequences(chain=vgene, vgene_only=vgene_only)

    glvrc01 = get_vrc01class_germline_lights(vgene)
    return identifyvrc01muts(input_seqs, vrc01_seqs, glvrc01, vgene)
def vrc01_class_mutation_positions(seqs, vgene_only=True):
    """Classify each germline-aligned position of every input sequence.

    All inputs, the VRC01-class panel and the glVRC01 germline are aligned
    together with ``muscle``. Columns where the germline is a gap are
    skipped. Every remaining column gets one code per input:

    * ``0`` -- the input is a gap or matches the germline
    * ``2`` -- differs from germline and matches an aligned record whose id
      is ``'minVRC01'`` or ``'min12A21'``
    * ``3`` -- differs from germline and matches a VRC01-class panel member
    * ``1`` -- differs from germline and matches none of the above

    Args:
        seqs: Iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: Forwarded to ``get_vrc01_germline_sequence`` only; the
            panel getters are called with their defaults.

    Returns:
        ``np.asarray`` over one ``np.asarray`` of codes per input sequence.
    """
    queries = [sequence.Sequence([rec['seq_id'], rec['vdj_aa']]) for rec in seqs]
    query_ids = [q.id for q in queries]

    # `expanded` is not a parameter; it is read from the enclosing scope.
    if expanded:
        panel = get_expanded_vrc01_class_sequences()
    else:
        panel = get_vrc01_class_sequences()
    panel_ids = [member.id for member in panel]

    # One multiple alignment over inputs + panel + germline.
    to_align = queries + panel
    to_align.append(get_vrc01_germline_sequence(vgene_only=vgene_only))
    aln = muscle(to_align)

    aln_queries = [rec for rec in aln if rec.id in query_ids]
    aln_germ = [rec for rec in aln if rec.id == 'glVRC01'][0]
    aln_minimal = [rec for rec in aln if rec.id in ['minVRC01', 'min12A21']]
    aln_panel = [rec for rec in aln if rec.id in panel_ids]

    rows = []
    for query in aln_queries:
        codes = []
        for col, (s, g) in enumerate(zip(str(query.seq), str(aln_germ.seq))):
            if g == '-':
                continue
            minimal_residues = [rec[col] for rec in aln_minimal]
            panel_residues = [rec[col] for rec in aln_panel]
            if s == '-' or s == g:
                code = 0
            elif s in minimal_residues:
                code = 2
            elif s in panel_residues:
                code = 3
            else:
                code = 1
            codes.append(code)
        rows.append(np.asarray(codes))
    return np.asarray(rows)
def _schief_mutation_string(seq, field):
    """Format ``seq[field]['muts']`` as '|'-joined ``was+position+is`` tokens ('' if absent)."""
    if field not in seq:
        return ''
    return '|'.join(
        str(mut['was']) + str(mut['position']) + str(mut['is'])
        for mut in seq[field]['muts'])


def _schief_indel_fields(seq, field, len_field):
    """Return ``[count, '[len:pos ...]']`` for ``seq[field]``, or ``['0', '']`` if absent."""
    if field not in seq:
        return ['0', '']
    events = seq[field]
    summary = ' '.join(
        str(event[len_field]) + ":" + str(event['position'])
        for event in events)
    return [len(events), '[' + summary + ']']


def _schief_output_line(seq, legacy, pairisvrc01class=False, s=None, t=None, m=None):
    """Build one row of output fields for ``seq``.

    Heavy-chain rows carry 34 fields, matching the all-blank placeholder
    returned for a missing sequence. Non-heavy rows omit the two D-gene
    fields, the two D-mutation placeholders and the VRC01-class mutation
    field.

    Args:
        seq: Annotated sequence supporting ``seq[key]``, ``key in seq`` and
            the ``fraction`` / ``confidence`` attributes, or ``None``.
        legacy: When true, insertion/deletion lengths are read from the
            ``'len'`` key; otherwise from ``'length'``.
        pairisvrc01class: When true and ``seq`` is a heavy chain, compute
            VRC01-class mutation statistics from the V-gene amino-acid
            sequence and emit the shared-mutation count.
        s: Optional list extended with the shared-mutation counts.
        t: Optional list extended with the total-mutation counts.
        m: Optional list extended with the per-position mutation codes.

    Returns:
        A list of field values; ``[''] * 34`` when ``seq`` is ``None``.
    """
    if seq is None:
        return [''] * 34

    line = [
        seq.fraction,
        seq.confidence,
        seq['v_gene']['gene'],
        _get_alternates(seq['v_gene']['others'], j_gene=False),
    ]
    is_heavy = seq['chain'] == 'heavy'

    if is_heavy:
        has_d_gene = 'd_gene' in seq
        line.append(seq['d_gene']['gene'] if has_d_gene else '')
        line.append(
            _get_alternates(seq['d_gene']['others'], j_gene=False)
            if has_d_gene else '')

    line.append(seq['j_gene']['gene'])
    line.append(_get_alternates(seq['j_gene']['others'], j_gene=True))
    line.append(seq['cdr3_len'] if 'cdr3_len' in seq else '')
    line.append(seq['cdr3_nt'].upper() if 'cdr3_nt' in seq else '')
    line.append(seq['cdr3_aa'].upper() if 'cdr3_aa' in seq else '')
    line.append(seq['junc_aa'])
    line.append(seq['junc_nt'])

    line.append(_schief_mutation_string(seq, 'var_muts_nt'))
    line.append(_schief_mutation_string(seq, 'var_muts_aa'))
    if is_heavy:
        # D-gene mutations are currently not available from abstar, so the
        # nt and aa columns carry a fixed placeholder.
        line.append('Data Not Available')
        line.append('Data Not Available')
    line.append(_schief_mutation_string(seq, 'join_muts_nt'))
    line.append(_schief_mutation_string(seq, 'join_muts_aa'))

    line.append(100. - seq['nt_identity']['v'])
    line.append(_get_fr_identity(seq, res='nt'))
    line.append(100. - seq['aa_identity']['v'])
    line.append(_get_fr_identity(seq, res='aa'))

    if is_heavy:
        # NOTE(review): the blank placeholder below is emitted for heavy
        # chains only, which keeps heavy rows at 34 fields -- confirm this
        # against the expected column layout for light chains.
        if pairisvrc01class:
            v_gene_aa = seq['v_gene']['aa_sequence']
            trimmed = sequence.Sequence(v_gene_aa)
            trimmed['seq_id'] = seq['seq_id']
            trimmed['vdj_aa'] = v_gene_aa
            vrc01_class, total = vrc01_class_mutation_count(
                [trimmed], vgene_only=True)
            muts = vrc01_class_mutation_positions([trimmed], vgene_only=True)
            # The accumulators default to None; only extend those supplied
            # so the call does not crash when they are omitted.
            for sink, values in ((s, vrc01_class), (t, total), (m, muts)):
                if sink is not None:
                    sink.extend(values)
            line.append(vrc01_class[0])
        else:
            line.append('')

    line.append('yes' if 'v_ins' in seq else '')
    line.append('yes' if 'v_del' in seq else '')
    line.append(seq['vdj_aa'])
    line.append(seq['vdj_nt'])

    len_field = 'len' if legacy else 'length'
    line.extend(_schief_indel_fields(seq, 'v_ins', len_field))
    line.extend(_schief_indel_fields(seq, 'v_del', len_field))

    # Cysteine counts over the full VDJ and the CDR3 amino-acid sequences.
    line.append(seq['vdj_aa'].upper().count('C') if 'vdj_aa' in seq else '')
    line.append(seq['cdr3_aa'].upper().count('C') if 'cdr3_aa' in seq else '')
    return line
num_of_tokens = len(tokens) if num_of_tokens != 3: return dict(cluster_fraction=tokens[0]) else: return dict(cluster_fraction=tokens[1], cluster_confidence=tokens[2]) seqs = [] basename = os.path.splitext(os.path.basename(sys.argv[1]))[0] with open(sys.argv[1]) as f: for line in f: if line.strip() == "": continue d = json.loads(line.strip()) # Bryan has a pair object seq = sequence.Sequence(d) seqs.append(seq) force_all_heavy_as_vrc01class = False if (len(sys.argv) > 2): if sys.argv[2] == "forcevrc01": force_all_heavy_as_vrc01class = True colortouse = '#45bc70' if (len(sys.argv) > 3): if sys.argv[3] == "orange": colortouse = '#f99248' else: colortouse = str(sys.argv[3]).strip() # munge the ids and create a dictionary from which we construct
def get_vrc01_class_sequences(chain='heavy', vgene_only=True, only_include=None):
    """Return the built-in VRC01-class reference sequences for ``chain``.

    Args:
        chain: ``'heavy'`` for the heavy-chain panel, or one of the
            module-level identifiers ``VK320``, ``VK133`` or ``VL214`` for
            the matching light-chain panel. Any other value yields an empty
            panel.
        vgene_only: Selects between the shorter and longer heavy-chain
            entries. The light-chain panels have a single form and are
            returned unchanged either way.
        only_include: Optional name, or iterable of names, used to filter
            the panel by entry name. ``None`` disables filtering.

    Returns:
        A list of ``sequence.Sequence`` objects, one per selected
        ``(name, amino-acid sequence)`` entry.
    """
    if vgene_only:
        heavy = [
            ('VRC01', 'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTR'),
            ('PGV04', 'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCAR'),
            ('VRC-CH31', 'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCAR'),
            ('3BNC60', 'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCAR'),
            ('12A12', 'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCAR'),
            ('PGV20', 'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCAR'),
        ]
    else:
        heavy = [
            ('VRC01', 'QVQLVQSGGQMKKPGESMRISCRASGYEFIDCTLNWIRLAPGKRPEWMGWLKPRGGAVNYARPLQGRVTMTRDVYSDTAFLELRSLTVDDTAVYFCTRGKNCDYNWDFEHWGRGTPVIVSS'),
            ('PGV04', 'QVQLVQSGSGVKKPGASVRVSCWTSEDIFERTELIHWVRQAPGQGLEWIGWVKTVTGAVNFGSPDFRQRVSLTRDRDLFTAHMDIRGLTQGDTATYFCARQKFYTGGQGWYFDLWGRGTLIVVSS'),
            ('VRC-CH31', 'QVQLVQSGAAVRKPGASVTVSCKFAEDDDYSPYWVNPAPEHFIHFLRQAPGQQLEWLAWMNPTNGAVNYAWYLNGRVTATRDRSMTTAFLEVKSLRSDDTAVYYCARAQKRGRSEWAYAHWGQGTPVVVSS'),
            ('3BNC60', 'QVHLSQSGAAVTKPGASVRVSCEASGYKISDHFIHWWRQAPGQGLQWVGWINPKTGQPNNPRQFQGRVSLTRQASWDFDTYSFYMDLKAVRSDDTAIYFCARQRSDFWDFDVWGSGTQVTVSS'),
            ('12A12', 'HLVQSGTQVKKPGASVRISCQASGYSFTDYVLHWWRQAPGQGLEWMGWIKPVYGARNYARRFQGRINFDRDIYREIAFMDLSGLRSDDTALYFCARDGSGDDTSWHLDPWGQGTLVIVSA'),
            ('PGV20', 'QVHLMQSGTEMKKPGASVRVTCQTSGYTFSDYFIHWLRQVPGRGFEWMGWMNPQWGQVNYARTFQGRVTMTRDVYREVAYLDLRSLTFADTAVYFCARRMRSQDREWDFQHWGQGTRIIVSS'),
        ]

    # The light-chain panels are defined outside the vgene_only branch so a
    # light-chain request with vgene_only=False no longer fails on an
    # unbound local.
    # NOTE(review): only one form of each light-chain entry exists here, so
    # it is returned regardless of vgene_only -- confirm no trimming is
    # expected.
    vk3_20_light = [
        ('VRC01', 'EIVLTQSPGTLSLSPGETAIISCRTSQYGSLAWYQQRPGQAPRLVIYSGSTRAAGIPDRFSGSRWGPDYNLTISNLESGDFGVYYCQQYEFFGQGTKVQVDIKR'),
        ('PGV04', 'EIVLTQSPGTLSLSPGETASLSCTAASYGHMTWYQKKPGQPPKLLIFATSKRASGIPDRFSGSQFGKQYTLTITRMEPEDFARYYCQQLEFFGQGTRLEIRR'),
        ('3BNC60', 'DIQMTQSPSSLSARVGDTVTITCQANGYLNWYQQRRGKAPKLLIYDGSKLERGVPARFSGRRWGQEYNLTINNLQPEDVATYFCQVYEFIVPGTRLDLKRTVAA'),
    ]
    vk1_33_light = [
        ('VRC-CH31', 'DIQMTQSPSSLSASLGDRVTITCQASRGIGKDLNWYQQKAGKAPKLLVSDASTLEGGVPSRFSGSGFHQNFSLTISSLQAEDVATYFCQQYETFGQGTKVDIK'),
        ('12A12', 'DIQMTQSPSSLSASVGDRVTITCQAGQGIGSSLQWYQQKPGKAPKLLVHGASNLHRGVPSRFSGSGFHTTFSLTISGLQRDDFATYFCAVLEFFGPGTKVEIKRTVAAPSV'),
    ]
    vl2_14_light = [
        ('PGV20', 'QSALTQPPSVSGSPGQSITLSCTGASTSVAWYQQYADKAPRLIVFDGNKRPSDISSRFSGSQSGGTASLTISGLQSEDEAYYHCNAFEFFGGGTKLTVL'),
    ]
    # Fallback panel for any chain value not recognised below.
    light = []

    if chain == 'heavy':
        seqs = heavy
    elif chain == VK320:
        seqs = vk3_20_light
    elif chain == VK133:
        seqs = vk1_33_light
    elif chain == VL214:
        seqs = vl2_14_light
    else:
        seqs = light

    if only_include is not None:
        # `unicode` only exists on Python 2; referencing it unguarded raises
        # NameError on Python 3, so resolve the string types defensively.
        try:
            string_types = (str, unicode)  # noqa: F821 (Python 2 only)
        except NameError:
            string_types = (str,)
        # A bare name is wrapped in a list so the membership test below
        # matches whole names instead of substrings.
        if isinstance(only_include, string_types):
            only_include = [only_include]
        seqs = [s for s in seqs if s[0] in only_include]

    return [sequence.Sequence(s) for s in seqs]