def vrc01_class_mutation_count(seqs, vgene_only=True):
    """Count total and VRC01-class-shared mutations for each input sequence.

    Every input sequence is aligned on its own (via ``muscle``) against the
    VRC01-class reference sequences plus the glVRC01 germline. The alignment
    is trimmed to the span covered by the input sequence and two counts are
    recorded per input.

    Args:
        seqs: iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: forwarded to ``get_vrc01_class_sequences`` and
            ``get_vrc01_germline_sequence``.

    Returns:
        ``(shared, total)``: two lists with one entry per input sequence.
        ``total[i]`` is the number of columns holding a germline residue in
        which the input differs from germline. ``shared[i]`` is the number of
        columns in which the input differs from germline and equals the
        residue of at least one VRC01-class sequence.
    """
    import re

    residue = re.compile("[a-zA-Z]")
    input_seqs = [sequence.Sequence([s['seq_id'], s['vdj_aa']]) for s in seqs]

    # Reference (VRC01-class) sequences and the germline they derive from.
    vrc01_seqs = get_vrc01_class_sequences(vgene_only=vgene_only)
    vrc01_names = {s.id for s in vrc01_seqs}
    glvrc01 = get_vrc01_germline_sequence(vgene_only=vgene_only)
    glvrc01_name = glvrc01.id

    shared = []
    total = []
    for s in input_seqs:
        aln = muscle([s] + vrc01_seqs + [glvrc01])
        aln_seq = [rec for rec in aln if rec.id == s.id][0]
        aln_gl = [rec for rec in aln if rec.id == glvrc01_name][0]
        aln_vrc01s = [rec for rec in aln if rec.id in vrc01_names]

        # Trim every row to the columns spanned by the input sequence so that
        # leading/trailing alignment gaps are not scored as mutations.
        # Trimming applies as soon as one residue is present; the previous
        # `len(index) > 1` check left single-residue inputs untrimmed.
        index = [m.start() for m in residue.finditer(str(aln_seq.seq))]
        if index:
            start, stop = index[0], index[-1] + 1
            aln_seq.seq = aln_seq.seq[start:stop]
            aln_gl.seq = aln_gl.seq[start:stop]
            for rec in aln_vrc01s:
                rec.seq = rec.seq[start:stop]

        query = str(aln_seq.seq)
        germline = str(aln_gl.seq)

        # Total mutations: differences from germline, ignoring columns where
        # the germline itself is a gap.
        total.append(sum(q != g for q, g in zip(query, germline) if g != '-'))

        # Shared mutations: a column counts once if the input differs from
        # germline and matches any reference residue in that column. Keying
        # by id keeps one row per reference id, as before.
        references = {rec.id: str(rec.seq) for rec in aln_vrc01s}
        any_shared = 0
        for q, g, column in zip(query, germline, zip(*references.values())):
            if q != g and q in column:
                any_shared += 1
        shared.append(any_shared)
    return shared, total
def vrc01_class_mutation_positions(seqs, vgene_only=True):
    """Classify every germline-aligned position of each input sequence.

    All inputs are aligned together (via ``muscle``) with the VRC01-class
    reference sequences and the glVRC01 germline. Columns where the germline
    is a gap are skipped; every other column receives one code per input:

    * ``0`` - input is a gap or equals the germline residue
    * ``2`` - mutated, and the residue occurs in ``minVRC01`` / ``min12A21``
    * ``3`` - mutated, and the residue occurs in a VRC01-class sequence
    * ``1`` - mutated, residue found in none of the references

    Args:
        seqs: iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: forwarded to ``get_vrc01_germline_sequence``.

    Returns:
        ``np.ndarray`` built from one code array per aligned input sequence.
        Rows follow the order in which the inputs appear in the alignment.
        NOTE(review): confirm that ``muscle`` preserves input order if callers
        match rows back to ``seqs`` by position.
    """
    input_seqs = [sequence.Sequence([s['seq_id'], s['vdj_aa']]) for s in seqs]
    input_names = {s.id for s in input_seqs}

    # NOTE(review): `expanded` is neither a parameter nor a local here; it is
    # presumably a module-level flag - confirm it is defined before use.
    if not expanded:
        hiv_seqs = get_vrc01_class_sequences()
    else:
        hiv_seqs = get_expanded_vrc01_class_sequences()
    all_hiv_names = {s.id for s in hiv_seqs}

    germline = get_vrc01_germline_sequence(vgene_only=vgene_only)
    aln = muscle(input_seqs + hiv_seqs + [germline])

    aln_seqs = [rec for rec in aln if rec.id in input_names]
    # Locate the germline row by the id of the sequence that was submitted
    # rather than by a hard-coded name.
    aln_gl = [rec for rec in aln if rec.id == germline.id][0]
    aln_mins = [rec for rec in aln if rec.id in ['minVRC01', 'min12A21']]
    aln_hiv = [rec for rec in aln if rec.id in all_hiv_names]

    # The reference residues of a column do not depend on the input sequence,
    # so collect them once per column. Gap columns of the germline are marked
    # with None and skipped below.
    germline_residues = str(aln_gl.seq)
    column_refs = [
        None if g == '-' else (
            {rec[i] for rec in aln_mins},
            {rec[i] for rec in aln_hiv},
        )
        for i, g in enumerate(germline_residues)
    ]

    data = []
    for rec in aln_seqs:
        seq_data = []
        for s, g, refs in zip(str(rec.seq), germline_residues, column_refs):
            if refs is None:
                continue
            min_residues, vrc01_residues = refs
            if s == '-' or s == g:
                seq_data.append(0)
            elif s in min_residues:
                seq_data.append(2)
            elif s in vrc01_residues:
                seq_data.append(3)
            else:
                seq_data.append(1)
        data.append(np.asarray(seq_data))
    return np.asarray(data)
def identifyvrc01muts(input_seqs, vrc01_seqs, glvrc01, genecall):
    """Count total and VRC01-class-shared mutations, honouring gene masks.

    Every input sequence is aligned on its own (via ``muscle``) against
    ``vrc01_seqs`` plus ``glvrc01``. The alignment is trimmed to the span
    covered by the input sequence before counting.

    Args:
        input_seqs: sequence objects exposing ``.id``; aligned one at a time.
        vrc01_seqs: list of VRC01-class reference sequence objects.
        glvrc01: germline sequence object; its ``.id`` locates the germline
            row in each alignment.
        genecall: compared against the module-level ``VH12``, ``VK320``,
            ``VK133`` and ``VL214`` values. When it matches one whose
            corresponding ``*mask`` flag is truthy, only positions marked
            ``"1"`` in the string returned by ``get_mask_string`` can count
            as shared.

    Returns:
        ``(shared, total)``: two lists with one entry per input sequence.
        ``total[i]`` is the number of columns holding a germline residue in
        which the input differs from germline (the mask is not applied).
        ``shared[i]`` is the number of columns in which the input differs
        from germline, equals the residue of at least one reference sequence
        and, when a mask is active, is allowed by the mask.
    """
    import re

    residue = re.compile("[a-zA-Z]")
    glvrc01_name = glvrc01.id
    vrc01_names = {s.id for s in vrc01_seqs}

    shared = []
    total = []
    for s in input_seqs:
        aln = muscle([s] + vrc01_seqs + [glvrc01])
        aln_seq = [rec for rec in aln if rec.id == s.id][0]
        aln_gl = [rec for rec in aln if rec.id == glvrc01_name][0]
        aln_vrc01s = [rec for rec in aln if rec.id in vrc01_names]

        # Trim every row to the columns spanned by the input sequence so that
        # leading/trailing alignment gaps are not scored as mutations.
        # Trimming applies as soon as one residue is present; the previous
        # `len(index) > 1` check left single-residue inputs untrimmed.
        index = [m.start() for m in residue.finditer(str(aln_seq.seq))]
        if index:
            start, stop = index[0], index[-1] + 1
            aln_seq.seq = aln_seq.seq[start:stop]
            aln_gl.seq = aln_gl.seq[start:stop]
            for rec in aln_vrc01s:
                rec.seq = rec.seq[start:stop]

        # Build the per-position mask for the trimmed germline. Without an
        # active mask the placeholder string only keeps zip() lengths aligned.
        chainmaskstring = "x" * len(aln_gl.seq)
        usemask = False
        if genecall == VH12 and vh12mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VH12])
            usemask = True
        elif genecall == VK320 and vk320mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VK320])
            usemask = True
        elif genecall == VK133 and vk133mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VK133])
            usemask = True
        elif genecall == VL214 and vl214mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VL214])
            usemask = True

        query = str(aln_seq.seq)
        germline = str(aln_gl.seq)

        # Total mutations: differences from germline, ignoring columns where
        # the germline itself is a gap.
        total.append(sum(q != g for q, g in zip(query, germline) if g != '-'))

        # Shared mutations: a column counts once if the input differs from
        # germline, matches any reference residue in that column and is not
        # masked out. Keying by id keeps one row per reference id, as before.
        references = {rec.id: str(rec.seq) for rec in aln_vrc01s}
        any_shared = 0
        for q, g, cmask, column in zip(query, germline, str(chainmaskstring),
                                       zip(*references.values())):
            if q == g or q not in column:
                continue
            if usemask and cmask != "1":
                continue
            any_shared += 1
        shared.append(any_shared)
    return shared, total