def test_mutinf(do_test_cond_mut_info=False):
    """Compare MI.mut_info (and optionally MI.cond_mut_info) with stored values.

    Args:
        do_test_cond_mut_info: when true, three MI.cond_mut_info checks are
            run in addition to the MI.mut_info checks.

    Note:
        np.isclose also applies its default relative tolerance, so the
        ``atol`` passed below is not the only bound on the comparison.
    """
    # The current implementation of MI only works for arrays whose bin labels
    # start from 0 and go up incrementally, hence the shift of every fixture.
    first = np.array([1, 2, 3, 3, 2, 1, 2, 2, 2, 1]) - 1
    second = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1]) - 1
    third = np.array([1, 2, 2, 2, 3, 2, 3, 1, 2, 1]) - 1
    constant = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) - 1

    observed_mi = MI.mut_info(first, second, x_bins=3, y_bins=3)
    assert np.isclose(observed_mi, 0.28418101912817351, atol=1e-16)

    if do_test_cond_mut_info:
        # (conditioning variable, expected conditional MI)
        conditional_cases = (
            (first + second, 0.50219293007150134),
            (third, 0.44115546225753777),
            (constant, 0.28418101912817351),
        )
        for conditioning, expected_cmi in conditional_cases:
            observed_cmi = MI.cond_mut_info(
                first, second, conditioning,
                x_bins=3, y_bins=3, z_bins=4)
            assert np.isclose(observed_cmi, expected_cmi, atol=1e-16)

    # An example from here:
    # https://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html#mifeatsel2
    cell_counts = [774106, 27625, 141, 49]
    ut = np.repeat([0, 1, 0, 1], cell_counts)
    cc = np.repeat([0, 0, 1, 1], cell_counts)
    observed_bits = MI.mut_info(ut, cc, base=2, x_bins=2, y_bins=2)
    assert np.isclose(observed_bits, 0.0001105, atol=1e-6)
def time_entropies():
    """Cross-check the entropy implementations, then print their timings.

    40000 normally distributed samples are discretized into 15 bins with
    MI.discretize. The three ``entropy`` variants ("scipy", "math", "numpy")
    and ``eta`` must agree on that vector before anything is timed.

    The ``entropy`` variants are timed over 500 calls each while ``eta`` is
    timed over 100 calls, so the printed totals are not directly comparable.
    """
    samples = np.random.normal(size=40000)
    binned = MI.discretize(samples, bins=15)

    # Compute every result first, then compare all of them to the scipy one.
    via_scipy = entropy(binned, how="scipy")
    via_math = entropy(binned, how="math")
    via_numpy = entropy(binned, how="numpy")
    via_eta = eta(binned)
    for candidate in (via_math, via_numpy, via_eta):
        assert np.isclose(via_scipy, candidate, atol=1e-16)

    def run_scipy():
        return entropy(binned, how="scipy")

    def run_math():
        return entropy(binned, how="math")

    def run_numpy():
        return entropy(binned, how="numpy")

    def run_eta():
        return eta(binned)

    scipy_seconds = timeit.timeit(run_scipy, number=500)
    math_seconds = timeit.timeit(run_math, number=500)
    numpy_seconds = timeit.timeit(run_numpy, number=500)
    eta_seconds = timeit.timeit(run_eta, number=100)

    report = (
        ("Entropy calculation with scipy takes : ", scipy_seconds),
        ("Entropy calculation with math takes : ", math_seconds),
        ("Entropy calculation with numpy takes : ", numpy_seconds),
        ("Entropy calculation with Counter takes : ", eta_seconds),
    )
    for label, seconds in report:
        print(label, seconds)
def time_calculate_MI_profiles(calculate_with_numba):
    """Run the per-seed MI calculation on a fixed, locally stored test batch.

    Args:
        calculate_with_numba: forwarded unchanged to
            calculate_MI_profiles.calculate_MI_for_seeds.

    The input locations are hard-coded absolute paths. Nothing is returned;
    the bin counts of the discretized expression profile are printed, and the
    called helpers are invoked with ``do_print=True``.
    """
    batch_dir = '/Users/student/Documents/hani/programs/pyteiser/data/test_1_batch_snrnpa1'
    mask_path = "/Users/student/Documents/hani/programs/pyteiser/data/mask_files/SNRNPA1_PSI_mask.bin"

    # The seeds path is assembled but not read anywhere in this function.
    seeds_path = os.path.join(batch_dir, 'seeds_4-7_4-9_4-6_14-20_30k_1.bin')
    profiles_path = os.path.join(
        batch_dir, 'snrnpa_profiles_4-7_4-9_4-6_14-20_30k_1.bin')

    n_bins = 15
    occurrence_threshold = 5

    unpacked = IO.unpack_profiles_and_mask(profiles_path, mask_path, do_print=True)
    profiles, mask_index, mask_values = unpacked

    binned_expression = MI.discretize_exp_profile(mask_index, mask_values, n_bins)

    # Show how many entries landed in each distinct bin.
    _, bin_counts = np.unique(binned_expression, return_counts=True, axis=0)
    print(bin_counts)

    calculate_MI_profiles.calculate_MI_for_seeds(
        profiles, mask_index, binned_expression,
        occurrence_threshold, calculate_with_numba, do_print=True)
def run_test_elongated_seed(seqs_of_interest, discr_exp_profile, nbins, N, do_print=True):
    """Profile one seed over the sequences and print its first matches.

    Args:
        seqs_of_interest: sequences the seed is matched against.
        discr_exp_profile: discretized expression profile passed to
            MI.mut_info as the second variable.
        nbins: number of bins of ``discr_exp_profile`` (used as ``y_bins``).
        N: how many matching sequences to list matches for.
        do_print: forwarded to create_one_seed; also controls whether the
            MI value is printed. The individual matches are always printed.
    """
    seed = create_one_seed(do_print)
    profile, _elapsed = matchmaker.calculate_profile_one_motif(
        seed, seqs_of_interest, is_degenerate=True)

    # Keep only the sequences whose profile entry is truthy.
    hits = [
        seqs_of_interest[position]
        for position, matched in enumerate(profile.values)
        if matched
    ]

    seed_mi = MI.mut_info(profile.values, discr_exp_profile,
                          x_bins=2, y_bins=nbins)
    if do_print:
        print(seed_mi)

    # Matches are numbered continuously across all listed sequences.
    match_number = 0
    for sequence in hits[:N]:
        starts = matchmaker.find_all_motif_instances(
            seed, sequence, is_degenerate=True)
        for start in starts:
            match_number += 1
            window = structures.w_sequence(seed.linear_length)
            window.nts = sequence.nts[start:start + seed.linear_length]
            rendered = window.print(return_string=True)
            print("Match %d: %s" % (match_number, rendered))
def main():
    """Load sequences and the expression mask, then run the elongated-seed check.

    Command-line options come from ``handler()``. The sequences selected by
    the mask's index array are passed, together with the discretized
    expression profile, to ``run_test_elongated_seed``.
    """
    args = handler()

    n_seqs_list = read_sequences(args.rna_bin_file)
    index_array, values_array = IO.unpack_mask_file(args.exp_mask_file)
    discr_exp_profile = MI.discretize_exp_profile(
        index_array, values_array, nbins=args.nbins)

    # Keep the sequences whose entry in the mask index array is truthy.
    seqs_of_interest = [
        n_seqs_list[position]
        for position, selected in enumerate(index_array)
        if selected
    ]

    # The helper defined in this file is named run_test_elongated_seed;
    # calling it under the name test_elongated_seed would raise NameError.
    run_test_elongated_seed(seqs_of_interest, discr_exp_profile, args.nbins,
                            args.number_example_matches_to_print)
def test_discret_eq_freq():
    """Check MI.discretize against two hand-written expected binnings.

    Each case supplies a float32 input vector, the number of bins, and the
    expected uint16 bin label for every input element.
    """
    cases = (
        # 3 bins
        (
            3,
            [0.5, 5.1, 5.2, 4.8, 9.9, 0.1, 9.7, 0.2, 10.3],
            [0, 1, 1, 1, 2, 0, 2, 0, 2],
        ),
        # 7 bins
        (
            7,
            [
                -0.490, 1.761, -1.400, 0.411, -0.244, 0.177, 0.091, -0.349,
                -0.554, 1.339, -0.094, 0.757, -0.469, -0.973, -1.192, -0.831,
                -0.618, 0.335, 0.020, 0.406, 0.301, -1.721, -0.678, -0.917,
                1.498, -1.084, -0.152, -0.915, 0.094, -0.499,
            ],
            [
                2, 6, 0, 5, 3, 4, 4, 3, 2, 6, 3, 6, 3, 0, 0, 1, 2, 5, 4, 5,
                5, 0, 1, 1, 6, 0, 3, 1, 4, 2,
            ],
        ),
    )

    for n_bins, raw_values, expected_labels in cases:
        raw = np.array(raw_values, dtype=np.float32)
        expected = np.array(expected_labels, dtype=np.uint16)
        observed = MI.discretize(raw, bins=n_bins)
        assert np.array_equal(observed, expected)
def time_discretization():
    """Print the total time of 1000 MI.discret_eq_freq calls on 10000 samples.

    The input is a fresh vector of normally distributed values and every
    call uses ``nbins=17``.
    """
    samples = np.random.normal(size=10000)

    # NOTE(review): other callers in this file use MI.discretize(..., bins=...);
    # confirm MI.discret_eq_freq(..., nbins=...) is still part of the MI API.
    def discretize_once():
        return MI.discret_eq_freq(samples, nbins=17)

    elapsed_seconds = timeit.timeit(discretize_once, number=1000)
    print("Discretization takes: ", elapsed_seconds)