def test_get_duplex_posits_C():
    """
    FIRST and second position test
    """
    ref_seq = "TCTTTCTTCT"

    d = get_duplex_posits_with_ref_nuc_at_first_pos(ref_seq, "C")
    assert d["start_pos"][0] == 1
    assert d["stop_pos"][0] == 2
    assert d["ref_duplex"][0] == "CT"
    assert d["start_pos"][1] == 5
    assert d["stop_pos"][1] == 6
    assert d["ref_duplex"][1] == "CT"
    assert d["start_pos"][2] == 8
    assert d["stop_pos"][2] == 9
    assert d["ref_duplex"][2] == "CT"

    d = get_duplex_posits_with_ref_nuc_at_second_pos(ref_seq, "C")
    assert d["start_pos"][0] == 0
    assert d["stop_pos"][0] == 1
    assert d["ref_duplex"][0] == "TC"
    assert d["start_pos"][1] == 4
    assert d["stop_pos"][1] == 5
    assert d["ref_duplex"][1] == "TC"
    assert d["start_pos"][2] == 7
    assert d["stop_pos"][2] == 8
    assert d["ref_duplex"][2] == "TC"
def test_get_duplex_posits_at_second_pos_at_the_start_and_at_the_end():
    """
    """
    ref_seq = "TCTTTTTTTC"

    d = get_duplex_posits_with_ref_nuc_at_second_pos(ref_seq, "C")
    assert d["start_pos"][0] == 0
    assert d["stop_pos"][0] == 1
    assert d["ref_duplex"][0] == "TC"
    assert d["start_pos"][1] == 8
    assert d["stop_pos"][1] == 9
    assert d["ref_duplex"][1] == "TC"

    # nucs to find is at the start without duplexes: like false positives
    ref_seq = "CTTTTTTTT"

    d = get_duplex_posits_with_ref_nuc_at_second_pos(ref_seq, "C")
    assert d["start_pos"] == []
    assert d["stop_pos"] == []
    assert d["ref_duplex"] == []
def test_get_duplex_posits_at_second_pos_with_gaps():
    """
    """
    ref_seq = "T-CTTT---CTTT--C"

    d = get_duplex_posits_with_ref_nuc_at_second_pos(ref_seq, "C")
    assert d["start_pos"][0] == 0
    assert d["stop_pos"][0] == 2
    assert d["ref_duplex"][0] == "TC"
    assert d["start_pos"][1] == 5
    assert d["stop_pos"][1] == 9
    assert d["ref_duplex"][1] == "TC"
    assert d["start_pos"][2] == 12
    assert d["stop_pos"][2] == 15
    assert d["ref_duplex"][2] == "TC"
Esempio n. 4
0
def test_create_pivot_df():
    print(os.listdir("./input_data"))

    input_files = get_input_files_names("./input_data")
    #print("input_files: ", input_files)

    ref_seq = get_ref("data_C_1.fasta")
    input_file = "data_C_1.fasta"
    #print(ref_seq)

    nuc = "C"
    snp_type = ["A", "T", "G"]

    duplex_posits_at_first = get_duplex_posits_with_ref_nuc_at_first_pos(
        ref_seq, nuc)
    duplex_posits_at_second = get_duplex_posits_with_ref_nuc_at_second_pos(
        ref_seq, nuc)

    df_first_pos = create_df(duplex_posits_at_first, snp_type)
    df_second_pos = create_df(duplex_posits_at_second, snp_type)
    df_snp_first_pos, coverage_first_pos, record_id_first_pos = count_snp_with_ref_nuc_at_first_pos(
        input_file, df_first_pos, nuc)
    df_snp_second_pos, coverage_second_pos, record_id_second_pos = count_snp_with_ref_nuc_at_second_pos(
        input_file, df_second_pos, nuc)

    pivot_df_C_first_pos = create_pivot_df(df_snp_first_pos)
    pivot_df_C_second_pos = create_pivot_df(df_snp_second_pos)

    print(pivot_df_C_first_pos)
    print(pivot_df_C_second_pos)

    assert pivot_df_C_first_pos.loc["CT", "A"] == 4
    assert pivot_df_C_first_pos.loc["CT", "G"] == 2
    assert pivot_df_C_first_pos.loc["CT", "T"] == 3

    assert pivot_df_C_second_pos.loc["TC", "A"] == 4
    assert pivot_df_C_second_pos.loc["TC", "G"] == 2
    assert pivot_df_C_second_pos.loc["TC", "T"] == 3
Esempio n. 5
0
def test_snp_df_nuc_A():
    #print(os.listdir("./input_data"))

    input_files = get_input_files_names("./input_data")
    #print("input_files: ", input_files)

    ref_seq = get_ref("data_A.fasta")
    input_file = "data_A.fasta"
    #print(ref_seq)

    nuc = "A"
    snp_type = ["T", "G", "C"]

    duplex_posits_at_first = get_duplex_posits_with_ref_nuc_at_first_pos(
        ref_seq, nuc)
    duplex_posits_at_second = get_duplex_posits_with_ref_nuc_at_second_pos(
        ref_seq, nuc)

    df_first_pos = create_df(duplex_posits_at_first, snp_type)
    df_second_pos = create_df(duplex_posits_at_second, snp_type)
    df_snp_first_pos, coverage_first_pos, record_id_first_pos = count_snp_with_ref_nuc_at_first_pos(
        input_file, df_first_pos, nuc)
    df_snp_second_pos, coverage_second_pos, record_id_second_pos = count_snp_with_ref_nuc_at_second_pos(
        input_file, df_second_pos, nuc)

    # uncomment to have a look at dataframes
    #print(df_snp_first_pos)
    #print(df_snp_second_pos)
    #########################################################
    # at first position
    assert df_snp_first_pos.loc[0, "start_pos"] == 2
    assert df_snp_first_pos.loc[0, "stop_pos"] == 3
    assert df_snp_first_pos.loc[0, "ref_duplex"] == "AT"
    assert df_snp_first_pos.loc[0, "G"] == 0
    assert df_snp_first_pos.loc[0, "C"] == 0
    assert df_snp_first_pos.loc[0, "T"] == 1

    assert df_snp_first_pos.loc[1, "start_pos"] == 5
    assert df_snp_first_pos.loc[1, "stop_pos"] == 6
    assert df_snp_first_pos.loc[1, "ref_duplex"] == "AT"
    assert df_snp_first_pos.loc[1, "G"] == 0
    assert df_snp_first_pos.loc[1, "C"] == 2
    assert df_snp_first_pos.loc[1, "T"] == 0

    assert df_snp_first_pos.loc[2, "start_pos"] == 8
    assert df_snp_first_pos.loc[2, "stop_pos"] == 9
    assert df_snp_first_pos.loc[2, "ref_duplex"] == "AT"
    assert df_snp_first_pos.loc[2, "G"] == 3
    assert df_snp_first_pos.loc[2, "C"] == 0
    assert df_snp_first_pos.loc[2, "T"] == 0

    assert df_snp_first_pos.loc[3, "start_pos"] == 10
    assert df_snp_first_pos.loc[3, "stop_pos"] == 11
    assert df_snp_first_pos.loc[3, "ref_duplex"] == "AT"
    assert df_snp_first_pos.loc[3, "G"] == 0
    assert df_snp_first_pos.loc[3, "C"] == 0
    assert df_snp_first_pos.loc[3, "T"] == 0

    ######################################################
    # at second position
    assert df_snp_second_pos.loc[0, "start_pos"] == 1
    assert df_snp_second_pos.loc[0, "stop_pos"] == 2
    assert df_snp_second_pos.loc[0, "ref_duplex"] == "TA"
    assert df_snp_second_pos.loc[0, "G"] == 0
    assert df_snp_second_pos.loc[0, "C"] == 0
    assert df_snp_second_pos.loc[0, "T"] == 1

    assert df_snp_second_pos.loc[1, "start_pos"] == 4
    assert df_snp_second_pos.loc[1, "stop_pos"] == 5
    assert df_snp_second_pos.loc[1, "ref_duplex"] == "TA"
    assert df_snp_second_pos.loc[1, "G"] == 0
    assert df_snp_second_pos.loc[1, "C"] == 2
    assert df_snp_second_pos.loc[1, "T"] == 0

    assert df_snp_second_pos.loc[2, "start_pos"] == 7
    assert df_snp_second_pos.loc[2, "stop_pos"] == 8
    assert df_snp_second_pos.loc[2, "ref_duplex"] == "TA"
    assert df_snp_second_pos.loc[2, "G"] == 3
    assert df_snp_second_pos.loc[2, "C"] == 0
    assert df_snp_second_pos.loc[2, "T"] == 0

    assert df_snp_second_pos.loc[3, "start_pos"] == 9
    assert df_snp_second_pos.loc[3, "stop_pos"] == 10
    assert df_snp_second_pos.loc[3, "ref_duplex"] == "TA"
    assert df_snp_second_pos.loc[3, "G"] == 0
    assert df_snp_second_pos.loc[3, "C"] == 0
    assert df_snp_second_pos.loc[3, "T"] == 0