Exemple #1
0
def set_fwr4_columns(record, database):
    """
    Fill in the FR4 columns of *record* and compute FR4/J mutation rates.

    FR4 is taken as the part of ``record["sequence"]`` between the CDR3 end
    (``cdr3_end``) and the end of the J alignment (``j_sequence_end``).

    Columns written: ``fwr4_start``, ``fwr4_end``, ``fwr4``, ``fwr4_aa``,
    ``FR4_SHM`` (nucleotide level, percent) and ``J_aa_mut`` (amino acid
    level, percent). A mutation rate is set to None when the germline
    reference it would be normalized by is empty.

    The record is left untouched when it has no J call, when its locus is
    not in ALLOWED_LOCI, when the database reports no CDR3 end for the J
    gene, or when the record has no CDR3 end.
    """
    j_call = record["j_call"]
    if not j_call or record["locus"] not in ALLOWED_LOCI:
        return

    cdr3_ref_end = database.j_cdr3_end(j_call, record["locus"])
    cdr3_query_end = record["cdr3_end"]
    if cdr3_ref_end is None or not cdr3_query_end:
        return

    fwr4_nt = record["sequence"][cdr3_query_end : record["j_sequence_end"]]

    # This overwrites some existing columns
    record["fwr4_start"] = cdr3_query_end + 1
    record["fwr4_end"] = record["j_sequence_end"]
    record["fwr4"] = fwr4_nt
    record["fwr4_aa"] = nt_to_aa(fwr4_nt)

    # Compute FR4 mutation rate on nucleotide level
    # NOTE(review): this germline slice covers the whole aligned part of the J
    # gene (j_germline_start..j_germline_end), whereas fwr4_nt starts only
    # after the CDR3 end, and cdr3_ref_end is used for nothing but the None
    # check above. Confirm whether the slice should start at cdr3_ref_end.
    germline = database.j[j_call][
        record["j_germline_start"] - 1 : record["j_germline_end"]
    ]
    dist = edit_distance(germline, fwr4_nt)
    # An empty germline slice would otherwise raise ZeroDivisionError
    record["FR4_SHM"] = (100.0 * dist / len(germline)) if germline else None

    # Compute FR4 amino acid mutation rate
    sequence_aa = record["fwr4_aa"]
    germline_aa = nt_to_aa(germline)
    dist = edit_distance(germline_aa, sequence_aa)
    # The translation is empty when the germline slice is shorter than a codon
    record["J_aa_mut"] = (
        (100.0 * dist / len(germline_aa)) if germline_aa else None
    )
Exemple #2
0
def assert_cdr3_detection(chain, s):
    """
    Assert that find_cdr3 finds the expected CDR3 in each test sequence.

    *s* is handed to split(), which yields pairs of (expected amino acid
    sequence, nucleotide sequence). Each nucleotide sequence is tried three
    times, with 0, 1 and 2 leading characters removed, and the translated
    match must equal the expected amino acids every time.
    """
    for expected_aa, nucleotides in split(s):
        for shift in range(3):
            shifted = nucleotides[shift:]
            hit = find_cdr3(shifted, chain)
            assert hit is not None
            start, stop = hit[0], hit[1]
            found_aa = nt_to_aa(shifted[start:stop])
            # The tuple identifies the failing case in the assertion message
            assert found_aa == expected_aa, (chain, expected_aa, shift)
Exemple #3
0
def set_aa_mut_columns(record, database):
    """
    Compute amino acid mutation rate for all regions on V and also for V
    itself as the sum of the regions (that is, excluding the CDR3)

    For each of FR1, CDR1, FR2, CDR2 and FR3, the column ``<region>_aa_mut``
    is set to the mutation rate in percent, or to None if the rate could not
    be determined (region coordinates missing, no usable germline reference,
    length mismatch between query and reference, or an implausibly high
    rate). ``V_aa_mut`` is only set to a number if all regions yielded a
    rate; otherwise it is None.
    """
    regions = (
        ("fwr1", "FR1"),
        ("cdr1", "CDR1"),
        ("fwr2", "FR2"),
        ("cdr2", "CDR2"),
        ("fwr3", "FR3"),
    )
    total_length = 0
    total_dist = 0
    n_regions = 0
    for airr_col, region in regions:
        record[region + "_aa_mut"] = None
        start = record[airr_col + "_start"]
        end = record[airr_col + "_end"]
        if start is None or end is None:
            continue
        sequence_aa = nt_to_aa(record["sequence"][start - 1 : end])
        germline_aa = database.v_regions_aa[record["v_call"]].get(region)
        if not germline_aa:
            # No reference available (None), or an empty reference, which
            # would lead to a division by zero below
            continue
        # Some FR1 alignments are reported with a frameshift by IgBLAST. By requiring that
        # reference and query lengths are identical, we can filter out these cases (and use
        # Hamming distance to get some speedup)
        if len(germline_aa) != len(sequence_aa):
            continue
        dist = hamming_distance(germline_aa, sequence_aa)
        mut_aa = dist / len(germline_aa)
        if mut_aa >= 0.8:
            # assume something went wrong
            continue
        total_dist += dist
        n_regions += 1
        total_length += len(germline_aa)
        record[region + "_aa_mut"] = 100.0 * mut_aa
    # The V-wide rate is only meaningful if every region contributed
    if n_regions == len(regions):
        record["V_aa_mut"] = 100.0 * total_dist / total_length
    else:
        record["V_aa_mut"] = None
Exemple #4
0
def set_cdr3_columns(record, database):
    """
    Locate the CDR3 in the query sequence and store it in *record*.

    The CDR3 start and end positions known to the database for the V and J
    genes are mapped to query coordinates with query_position(). On success,
    the columns ``cdr3_start``, ``cdr3_end``, ``cdr3`` and ``cdr3_aa`` are
    written. The record is left unchanged if the V or J call is missing, the
    locus is not in ALLOWED_LOCI, the database has no reference position for
    either gene, or the CDR3 end cannot be mapped onto the query.
    """
    has_required_calls = (
        record["v_call"]
        and record["j_call"]
        and record["locus"] in ALLOWED_LOCI
    )
    if not has_required_calls:
        return

    # Start of the CDR3, as a position on the V reference
    ref_start = database.v_cdr3_start(record["v_call"], record["locus"])
    if ref_start is None:
        return
    query_start = query_position(record, "v", reference_position=ref_start)
    if query_start is None:
        # The V alignment stops before the CDR3 start. Rescue the position by
        # pretending the alignment continued without any indels.
        v_query_end = record["v_sequence_end"]
        uncovered = ref_start - record["v_germline_end"]
        query_start = v_query_end + uncovered

    # End of the CDR3, as a position on the J reference
    ref_end = database.j_cdr3_end(record["j_call"], record["locus"])
    if ref_end is None:
        return
    query_end = query_position(record, "j", reference_position=ref_end)
    if query_end is None:
        return

    nucleotides = record["sequence"][query_start:query_end]
    record["cdr3_start"] = query_start + 1
    record["cdr3_end"] = query_end
    record["cdr3"] = nucleotides
    record["cdr3_aa"] = nt_to_aa(nucleotides)