Ejemplo n.º 1
0
def vrc01_class_mutation_count(seqs, vgene_only=True):
    """Count total and VRC01-class-shared mutations for each input sequence.

    Every input sequence is aligned on its own (via ``muscle``) together with
    the VRC01-class reference sequences and the glVRC01 germline sequence.
    The alignment is then trimmed to the columns spanned by the input
    sequence's first and last residue, so alignment padding outside the input
    is not compared.

    Args:
        seqs: iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: passed through to ``get_vrc01_class_sequences`` and
            ``get_vrc01_germline_sequence``.

    Returns:
        A ``(shared, total)`` tuple of two lists with one entry per input
        sequence, in input order.

        * ``total``: number of trimmed columns where the germline is not a
          gap and the input character differs from it (an input gap counts
          as a difference).
        * ``shared``: number of trimmed columns where the input character
          differs from the germline and equals the character of at least one
          VRC01-class reference sequence.
    """
    import re
    residue_re = re.compile("[a-zA-Z]")

    input_seqs = [sequence.Sequence([s['seq_id'], s['vdj_aa']]) for s in seqs]

    shared = []
    total = []

    # Reference set and germline are the same for every input sequence.
    vrc01_seqs = get_vrc01_class_sequences(vgene_only=vgene_only)
    vrc01_names = [s.id for s in vrc01_seqs]
    glvrc01 = get_vrc01_germline_sequence(vgene_only=vgene_only)
    glvrc01_name = glvrc01.id

    for s in input_seqs:
        aln = muscle([s] + vrc01_seqs + [glvrc01])
        aln_seq = [rec for rec in aln if rec.id == s.id][0]
        aln_gl = [rec for rec in aln if rec.id == glvrc01_name][0]
        aln_vrc01s = [rec for rec in aln if rec.id in vrc01_names]

        # Trim every aligned row to the window covered by the input sequence.
        # A single residue is enough to define that window; only an input
        # with no residues at all is left untrimmed.
        positions = [m.start() for m in residue_re.finditer(str(aln_seq.seq))]
        if positions:
            window = slice(positions[0], positions[-1] + 1)
            aln_seq.seq = aln_seq.seq[window]
            aln_gl.seq = aln_gl.seq[window]
            for ref in aln_vrc01s:
                ref.seq = ref.seq[window]

        query = str(aln_seq.seq)
        germline = str(aln_gl.seq)

        # Total mutations: differences from germline at non-gap germline columns.
        total.append(sum(q != g for q, g in zip(query, germline) if g != '-'))

        # Per reference, flag columns where the input left germline and
        # matches that reference. Keyed by id, so references sharing an id
        # collapse to the last one seen.
        shared_by_ref = {
            ref.id: [q == v and q != g
                     for q, g, v in zip(query, germline, str(ref.seq))]
            for ref in aln_vrc01s
        }
        # A column counts once, however many references share the mutation.
        shared.append(sum(any(col) for col in zip(*shared_by_ref.values())))
    return shared, total
Ejemplo n.º 2
0
def vrc01_class_mutation_positions(seqs, vgene_only=True):
    """Classify each germline-aligned position of every input sequence.

    All input sequences, the VRC01-class reference sequences and the germline
    sequence are aligned together in a single ``muscle`` call. Columns where
    the aligned ``'glVRC01'`` row has a gap are skipped; every other column
    gets one integer code per input sequence:

    * ``0`` -- input has a gap or matches the germline
    * ``2`` -- input differs from germline and matches ``'minVRC01'`` or
      ``'min12A21'`` at that column
    * ``3`` -- input differs from germline and matches one of the reference
      sequences at that column
    * ``1`` -- input differs from germline and matches none of them

    Args:
        seqs: iterable of mappings providing ``'seq_id'`` and ``'vdj_aa'``.
        vgene_only: passed through to ``get_vrc01_germline_sequence`` only.

    Returns:
        ``np.asarray`` over one code array per aligned input row, in the
        order the rows appear in the alignment.
    """
    data = []
    query_records = [
        sequence.Sequence([entry['seq_id'], entry['vdj_aa']]) for entry in seqs
    ]
    query_ids = [rec.id for rec in query_records]

    # NOTE(review): `expanded` is not a parameter of this function; it has to
    # be resolvable from the enclosing/module scope -- confirm.
    # NOTE(review): `vgene_only` is not forwarded to the reference lookup.
    if expanded:
        reference_records = get_expanded_vrc01_class_sequences()
    else:
        reference_records = get_vrc01_class_sequences()
    reference_ids = [rec.id for rec in reference_records]

    # One joint alignment: inputs, then references, then the germline.
    germline_record = get_vrc01_germline_sequence(vgene_only=vgene_only)
    aln = muscle(query_records + reference_records + [germline_record])

    query_rows = [row for row in aln if row.id in query_ids]
    germline_row = [row for row in aln if row.id == 'glVRC01'][0]
    minimal_rows = [row for row in aln if row.id in ['minVRC01', 'min12A21']]
    reference_rows = [row for row in aln if row.id in reference_ids]

    for query_row in query_rows:
        codes = []
        columns = zip(str(query_row.seq), str(germline_row.seq))
        for col, (residue, germ) in enumerate(columns):
            if germ == '-':
                # Insertions relative to germline have no germline position.
                continue
            minimal_column = [row[col] for row in minimal_rows]
            reference_column = [row[col] for row in reference_rows]
            if residue == '-' or residue == germ:
                code = 0
            elif residue in minimal_column:
                code = 2
            elif residue in reference_column:
                code = 3
            else:
                code = 1
            codes.append(code)
        data.append(np.asarray(codes))
    return np.asarray(data)
Ejemplo n.º 3
0
def identifyvrc01muts(input_seqs, vrc01_seqs, glvrc01, genecall):
    """Count total and VRC01-class-shared mutations for each input sequence.

    Every input sequence is aligned on its own (via ``muscle``) together with
    ``vrc01_seqs`` and ``glvrc01``. The alignment is trimmed to the columns
    spanned by the input sequence's first and last residue. If ``genecall``
    equals one of the module-level gene-call constants and the corresponding
    mask switch is truthy, shared mutations are only counted at columns
    whose mask character is ``"1"``.

    Args:
        input_seqs: sequence records to analyse; each needs ``.id``.
        vrc01_seqs: list of VRC01-class reference records (each with ``.id``).
        glvrc01: germline reference record (with ``.id``).
        genecall: gene call compared against ``VH12``, ``VK320``, ``VK133``
            and ``VL214`` to select a mask.

    Returns:
        A ``(shared, total)`` tuple of two lists with one entry per input
        sequence, in input order.

        * ``total``: number of trimmed columns where the germline is not a
          gap and the input character differs from it (an input gap counts
          as a difference). The mask is not applied here.
        * ``shared``: number of trimmed (and, if a mask is active,
          mask-included) columns where the input character differs from the
          germline and equals the character of at least one reference.
    """
    import re
    residue_re = re.compile("[a-zA-Z]")

    shared = []
    total = []
    glvrc01_name = glvrc01.id
    vrc01_names = [ref.id for ref in vrc01_seqs]

    for s in input_seqs:
        aln = muscle([s] + vrc01_seqs + [glvrc01])
        aln_seq = [rec for rec in aln if rec.id == s.id][0]
        aln_gl = [rec for rec in aln if rec.id == glvrc01_name][0]
        aln_vrc01s = [rec for rec in aln if rec.id in vrc01_names]

        # Trim every aligned row to the window covered by the input sequence.
        # A single residue is enough to define that window; only an input
        # with no residues at all is left untrimmed.
        positions = [m.start() for m in residue_re.finditer(str(aln_seq.seq))]
        if positions:
            window = slice(positions[0], positions[-1] + 1)
            aln_seq.seq = aln_seq.seq[window]
            aln_gl.seq = aln_gl.seq[window]
            for ref in aln_vrc01s:
                ref.seq = ref.seq[window]

        # Pick the mask for this gene call, if one is enabled. The mask
        # string is built from the trimmed germline row; a column is included
        # when its mask character is "1".
        # NOTE(review): the exact output of get_mask_string is not visible
        # here -- presumably one character per germline column; confirm.
        usemask = True
        if genecall == VH12 and vh12mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VH12])
        elif genecall == VK320 and vk320mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VK320])
        elif genecall == VK133 and vk133mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VK133])
        elif genecall == VL214 and vl214mask:
            chainmaskstring = get_mask_string(aln_gl.seq, masks[VL214])
        else:
            # Placeholder of germline length so the zip below is not shortened.
            usemask = False
            chainmaskstring = "x" * len(aln_gl.seq)

        query = str(aln_seq.seq)
        germline = str(aln_gl.seq)
        mask = str(chainmaskstring)

        # Total mutations: differences from germline at non-gap germline columns.
        total.append(sum(q != g for q, g in zip(query, germline) if g != '-'))

        # Per reference, flag columns where the input left germline, matches
        # that reference, and (when masking) the column is included. Keyed by
        # id, so references sharing an id collapse to the last one seen.
        shared_by_ref = {
            ref.id: [
                q == v and q != g and (not usemask or cmask == "1")
                for q, g, v, cmask in zip(query, germline, str(ref.seq), mask)
            ]
            for ref in aln_vrc01s
        }
        # A column counts once, however many references share the mutation.
        shared.append(sum(any(col) for col in zip(*shared_by_ref.values())))
    return shared, total