Esempio n. 1
0
def test_mutinf(do_test_cond_mut_info=False):
    """Compare MI.mut_info (and optionally MI.cond_mut_info) with reference values.

    Args:
        do_test_cond_mut_info: when true, additionally run three
            MI.cond_mut_info checks that condition on different third
            variables.

    Raises:
        AssertionError: if any computed value is not close to its reference.
    """
    # The current implementation of MI only works for arrays where bins start
    # from 0 and go incrementally, so the 1-based labels are shifted down by 1.
    one_arr = np.array([1, 2, 3, 3, 2, 1, 2, 2, 2, 1]) - 1
    two_arr = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1]) - 1
    three_arr = np.array([1, 2, 2, 2, 3, 2, 3, 1, 2, 1]) - 1
    four_arr = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) - 1

    # NOTE: np.isclose also applies its default relative tolerance on top of
    # the atol passed here.
    observed_mi = MI.mut_info(one_arr, two_arr, x_bins=3, y_bins=3)
    assert np.isclose(observed_mi, 0.28418101912817351, atol=1e-16)

    if do_test_cond_mut_info:
        # (conditioning variable, expected conditional MI). The last case
        # conditions on a constant array and expects the same value as the
        # unconditional MI checked above.
        cmi_cases = (
            (one_arr + two_arr, 0.50219293007150134),
            (three_arr, 0.44115546225753777),
            (four_arr, 0.28418101912817351),
        )
        for z_arr, cmi_expected in cmi_cases:
            cmi_observed = MI.cond_mut_info(one_arr,
                                            two_arr,
                                            z_arr,
                                            x_bins=3,
                                            y_bins=3,
                                            z_bins=4)
            assert np.isclose(cmi_observed, cmi_expected, atol=1e-16)

    # an example from here: https://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html#mifeatsel2
    cell_counts = [774106, 27625, 141, 49]
    ut = np.repeat([0, 1, 0, 1], cell_counts)
    cc = np.repeat([0, 0, 1, 1], cell_counts)
    observed_mi_base_2 = MI.mut_info(ut, cc, base=2, x_bins=2, y_bins=2)
    assert np.isclose(observed_mi_base_2, 0.0001105, atol=1e-6)
Esempio n. 2
0
def time_entropies():
    """Cross-check four entropy computations on one vector and print timings.

    Draws 40000 normal samples, discretizes them into 15 bins with
    MI.discretize, asserts that the "scipy", "math" and "numpy" variants of
    entropy() and eta() agree, then times each one with timeit.

    Raises:
        AssertionError: if any variant disagrees with the "scipy" result.
    """
    samples = np.random.normal(size=40000)
    discr_vect = MI.discretize(samples, bins=15)

    reference = entropy(discr_vect, how="scipy")
    alternatives = (
        entropy(discr_vect, how="math"),
        entropy(discr_vect, how="numpy"),
        eta(discr_vect),
    )
    for candidate in alternatives:
        assert np.isclose(reference, candidate, atol=1e-16)

    # (label, callable, repetitions). The Counter-based variant runs 100
    # times while the others run 500, so the printed totals are not directly
    # comparable.
    benchmarks = (
        ("Entropy calculation with scipy takes : ",
         lambda: entropy(discr_vect, how="scipy"), 500),
        ("Entropy calculation with math takes : ",
         lambda: entropy(discr_vect, how="math"), 500),
        ("Entropy calculation with numpy takes : ",
         lambda: entropy(discr_vect, how="numpy"), 500),
        ("Entropy calculation with Counter takes : ",
         lambda: eta(discr_vect), 100),
    )

    # Run every measurement before printing anything, as the report is
    # emitted in one go afterwards.
    timings = [
        timeit.timeit(func, number=repetitions)
        for _, func, repetitions in benchmarks
    ]
    for (label, _, _), elapsed in zip(benchmarks, timings):
        print(label, elapsed)
Esempio n. 3
0
def time_calculate_MI_profiles(calculate_with_numba):
    """Run calculate_MI_for_seeds on a fixed local test batch.

    The input files are hard-coded absolute paths, so this only works on a
    machine where those files exist.

    Args:
        calculate_with_numba: forwarded unchanged to
            calculate_MI_profiles.calculate_MI_for_seeds.
    """
    nbins = 15
    min_occurences = 5

    test_batch_folder = '/Users/student/Documents/hani/programs/pyteiser/data/test_1_batch_snrnpa1'
    exp_mask_filename = "/Users/student/Documents/hani/programs/pyteiser/data/mask_files/SNRNPA1_PSI_mask.bin"
    # The seeds path is built but not read anywhere below.
    seeds_filename = os.path.join(test_batch_folder,
                                  'seeds_4-7_4-9_4-6_14-20_30k_1.bin')
    profiles_filename = os.path.join(
        test_batch_folder, 'snrnpa_profiles_4-7_4-9_4-6_14-20_30k_1.bin')

    unpacked = IO.unpack_profiles_and_mask(profiles_filename,
                                           exp_mask_filename,
                                           do_print=True)
    decompressed_profiles_array, index_array, values_array = unpacked

    discr_exp_profile = MI.discretize_exp_profile(index_array, values_array,
                                                  nbins)

    # Print how many entries fall into each distinct discretized value.
    _, counts = np.unique(discr_exp_profile, return_counts=True, axis=0)
    print(counts)

    MI_values_array = calculate_MI_profiles.calculate_MI_for_seeds(
        decompressed_profiles_array,
        index_array,
        discr_exp_profile,
        min_occurences,
        calculate_with_numba,
        do_print=True)
Esempio n. 4
0
def run_test_elongated_seed(seqs_of_interest,
                            discr_exp_profile,
                            nbins,
                            N,
                            do_print=True):
    """Match one seed against sequences, report its MI and print example matches.

    Args:
        seqs_of_interest: indexable collection of sequence objects; each one
            is expected to expose an ``nts`` attribute that can be sliced.
        discr_exp_profile: discretized expression profile passed to
            MI.mut_info as the second variable.
        nbins: number of bins of ``discr_exp_profile`` (passed as ``y_bins``).
        N: how many of the matching sequences to print motif instances for.
        do_print: forwarded to create_one_seed and controls printing of the
            MI value. The individual matches are printed regardless.
    """
    elong_seed = create_one_seed(do_print)
    current_profile, _ = matchmaker.calculate_profile_one_motif(
        elong_seed, seqs_of_interest, is_degenerate=True)

    # Keep only the sequences whose profile entry is truthy.
    hits = current_profile.values
    matching_sequences = [
        seqs_of_interest[idx] for idx, is_hit in enumerate(hits) if is_hit
    ]

    curr_mi = MI.mut_info(hits, discr_exp_profile, x_bins=2, y_bins=nbins)
    if do_print:
        print(curr_mi)

    motif_length = elong_seed.linear_length
    match_number = 0
    for seq in matching_sequences[:N]:
        instance_starts = matchmaker.find_all_motif_instances(
            elong_seed, seq, is_degenerate=True)
        for start in instance_starts:
            match_number += 1
            # Copy the matched window into a fresh sequence object so it can
            # be rendered as a string.
            match_sequence = structures.w_sequence(motif_length)
            match_sequence.nts = seq.nts[start:start + motif_length]
            match_string = match_sequence.print(return_string=True)
            print("Match %d: %s" % (match_number, match_string))
def main():
    """Load sequences and an expression mask, then print example seed matches.

    Reads the command-line arguments via handler(), loads the sequences and
    the expression mask file, discretizes the expression values into
    ``args.nbins`` bins, keeps the sequences whose mask entry is truthy, and
    hands everything to run_test_elongated_seed.
    """
    args = handler()

    n_seqs_list = read_sequences(args.rna_bin_file)
    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discr_exp_profile = MI.discretize_exp_profile(index_array,
                                                  values_array,
                                                  nbins=args.nbins)

    # Keep only the sequences selected by the mask.
    seqs_of_interest = [
        n_seqs_list[idx] for idx, keep in enumerate(index_array) if keep
    ]

    # The worker in this file is named run_test_elongated_seed; the previous
    # call to test_elongated_seed referred to a name that is not defined here.
    run_test_elongated_seed(seqs_of_interest, discr_exp_profile, args.nbins,
                            args.number_example_matches_to_print)
Esempio n. 6
0
def test_discret_eq_freq():
    """Check MI.discretize against expected bin labels for 3 and 7 bins.

    Each case feeds a float32 vector to MI.discretize and requires the result
    to equal the expected uint16 label array element for element.

    Raises:
        AssertionError: if a discretized vector differs from its expectation.
    """
    # (number of bins, input values, expected bin label per value)
    cases = (
        (
            3,
            [0.5, 5.1, 5.2, 4.8, 9.9, 0.1, 9.7, 0.2, 10.3],
            [0, 1, 1, 1, 2, 0, 2, 0, 2],
        ),
        (
            7,
            [
                -0.490, 1.761, -1.400, 0.411, -0.244, 0.177, 0.091, -0.349,
                -0.554, 1.339, -0.094, 0.757, -0.469, -0.973, -1.192, -0.831,
                -0.618, 0.335, 0.020, 0.406, 0.301, -1.721, -0.678, -0.917,
                1.498, -1.084, -0.152, -0.915, 0.094, -0.499
            ],
            [
                2, 6, 0, 5, 3, 4, 4, 3, 2, 6, 3, 6, 3, 0, 0, 1, 2, 5, 4, 5, 5,
                0, 1, 1, 6, 0, 3, 1, 4, 2
            ],
        ),
    )

    for n_bins, raw_values, expected_labels in cases:
        to_discretize = np.array(raw_values, dtype=np.float32)
        expected = np.array(expected_labels, dtype=np.uint16)
        observed = MI.discretize(to_discretize, bins=n_bins)
        assert np.array_equal(observed, expected)
Esempio n. 7
0
def time_discretization():
    """Time 1000 calls of MI.discret_eq_freq on 10000 normal samples.

    Prints the total elapsed time reported by timeit.
    """
    samples = np.random.normal(size=10000)

    def discretize_once():
        # 17 bins, as in the original benchmark.
        return MI.discret_eq_freq(samples, nbins=17)

    elapsed = timeit.timeit(discretize_once, number=1000)
    print("Discretization takes: ", elapsed)