def MI_get_pvalue_and_zscore(active_profile, discr_exp_profile, nbins, current_MI, n_permutations):
    """Score an observed MI value against a permutation null distribution.

    The expression profile is shuffled ``n_permutations`` times with
    ``np.random.permutation``; every shuffle is scored against
    ``active_profile`` through ``MI.mut_info`` (2 bins for the profile,
    ``nbins`` bins for the expression).

    Returns:
        (pvalue, z_score): ``pvalue`` is the number of permuted MI values,
        counted from the top of the sorted null distribution, that are not
        smaller than ``current_MI``, divided by ``n_permutations``;
        ``z_score`` is ``current_MI`` minus the null mean, divided by the
        null standard deviation.
    """
    null_mi = np.zeros(n_permutations, dtype=np.float64)
    for perm_idx in range(n_permutations):
        null_mi[perm_idx] = MI.mut_info(
            active_profile,
            np.random.permutation(discr_exp_profile),
            x_bins=2,
            y_bins=nbins,
        )
    # Ascending order; mean/std below are taken over this sorted array.
    null_mi = np.sort(null_mi)

    if current_MI < null_mi[0]:
        # Observed value lies below the whole null distribution.
        n_not_below = n_permutations
    else:
        # Walk down from the largest permuted value and count the ones
        # the observed MI does not exceed.
        n_not_below = 0
        for permuted_mi in null_mi[::-1]:
            if not (current_MI <= permuted_mi):
                break
            n_not_below += 1

    pvalue = n_not_below / float(n_permutations)
    null_mean = np.mean(null_mi)
    null_std = np.std(null_mi)
    z_score = (current_MI - null_mean) / null_std
    return pvalue, z_score
def min_CI_normalized_test(counter, accepted_seeds_list, profiles_passed, discr_exp_profile, nbins, index_array, min_ratio, do_print=False):
    """Check a candidate seed for redundancy against already accepted seeds.

    For every accepted seed the ratio

        I(candidate; expression | accepted) / I(candidate; accepted)

    is computed with ``MI.cond_mut_info`` and ``MI.mut_info``. The candidate
    fails as soon as one ratio drops below ``min_ratio``.

    Args:
        counter: index of the candidate seed in ``profiles_passed``.
        accepted_seeds_list: indices (into ``profiles_passed``) of the seeds
            accepted so far.
        profiles_passed: indexable collection of seed profiles.
        discr_exp_profile: discretized expression profile.
        nbins: number of bins used for the expression profile.
        index_array: index applied to every profile before comparison.
        min_ratio: smallest acceptable ratio.
        do_print: when true, report each comparison on stdout.

    Returns:
        ``(True, 0)`` if the candidate passes against every accepted seed,
        otherwise ``(False, i)`` where ``i`` is the position inside
        ``accepted_seeds_list`` of the first seed it is too similar to.
    """
    profile_being_analyzed = profiles_passed[counter][index_array]

    for i, accepted_seed in enumerate(accepted_seeds_list):
        ith_accepted_profile = profiles_passed[accepted_seed][index_array]

        cond_inf = MI.cond_mut_info(profile_being_analyzed, discr_exp_profile,
                                    ith_accepted_profile,
                                    x_bins=2, y_bins=nbins, z_bins=2)
        mut_inf = MI.mut_info(profile_being_analyzed, ith_accepted_profile,
                              x_bins=2, y_bins=2)

        # Avoid dividing by zero when the two profiles share no information.
        if np.isclose(mut_inf, 0., atol=1e-16):
            mut_inf = 1e-16
        ratio = cond_inf / mut_inf

        # Output is opt-in, like in the other routines taking do_print.
        if do_print:
            print("Comparing seed #%d to an existing seed #%d. The ratio is %.2f" %
                  (counter, i, ratio))

        if ratio < min_ratio:
            return False, i

    return True, 0
def calculate_MI_for_seeds(decompressed_profiles_array, index_array, discr_exp_profile, nbins, min_occurences, do_print=False):
    """Compute the MI between every seed profile and the expression profile.

    Each profile is first restricted with ``index_array``. Seeds whose
    restricted profile sums to ``min_occurences`` or less are not scored and
    receive ``MASK_OUT_SEED_VALUE`` instead. With ``do_print`` a progress line
    is printed for every scored seed whose index is a positive multiple of
    1000.

    Returns:
        One value per seed as a float64 array. Values are accumulated in a
        float32 buffer and widened to float64 at the end.
    """
    n_seeds = decompressed_profiles_array.shape[0]
    mi_buffer = np.zeros(n_seeds, dtype=np.float32)

    for seed_idx, full_profile in enumerate(decompressed_profiles_array):
        restricted = full_profile[index_array]
        too_rare = restricted.sum() <= min_occurences

        if too_rare:
            mi_buffer[seed_idx] = MASK_OUT_SEED_VALUE
        else:
            mi_buffer[seed_idx] = MI.mut_info(restricted, discr_exp_profile,
                                              x_bins=2, y_bins=nbins)
            if do_print and seed_idx % 1000 == 0 and seed_idx > 0:
                print("Profile number %d has been calculated" % seed_idx)

    # Hand back a uniform float64 array.
    return mi_buffer.astype(np.float64)
def optimize_motifs(seeds_initial, profiles_initial, discr_exp_profile, nbins, index_array, seqs_of_interest, args, do_print = True):
    """Optimize, elongate and characterize every seed motif.

    For each seed the routine:
      1. converts it with ``type_conversions.w_to_n_motif`` and scores its
         initial profile (restricted by ``index_array``);
      2. runs ``optimize_motif_sequence`` and then ``elongate_motif``;
      3. gets the final profile, MI, p-value and z-score from
         ``get_characteristics``;
      4. runs ``check_robustness`` on the final profile.

    Returns:
        (seeds_optimized, profiles_optimized, seed_charact_array,
        robustness_array): the optimized motifs (a deep copy of
        ``seeds_initial`` with every entry replaced), a boolean matrix with
        one profile row per seed, a float64 ``(n_seeds, 3)`` matrix holding
        MI, p-value and z-score, and a boolean robustness flag per seed.
    """
    n_seeds = len(seeds_initial)
    seeds_optimized = copy.deepcopy(seeds_initial)
    profiles_optimized = np.zeros((n_seeds, discr_exp_profile.shape[0]), dtype=bool)
    # columns: MI value, p-value, z-score
    seed_charact_array = np.zeros((n_seeds, 3), dtype=np.float64)
    robustness_array = np.zeros(n_seeds, dtype=bool)

    for seed_idx, seed in enumerate(seeds_initial):
        starting_profile = profiles_initial[seed_idx][index_array]
        n_bestmotif = type_conversions.w_to_n_motif(seed)

        # MI and occurrence frequency of the motif before any optimization
        starting_MI = MI.mut_info(starting_profile, discr_exp_profile,
                                  x_bins=2, y_bins=nbins)
        lastmyfreq = starting_profile.sum() / float(starting_profile.shape[0])

        if do_print:
            w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
            sequence_text = w_bestmotif.print_sequence(return_string=True)
            print("Optimzing the sequence of motif %d (sequence is %s). Initial MI = %.5f" %
                  (seed_idx, sequence_text, starting_MI))

        bestmi, lastmyfreq, n_bestmotif = optimize_motif_sequence(
            n_bestmotif, starting_MI, seqs_of_interest, discr_exp_profile,
            nbins, lastmyfreq, args,
            do_print = do_print, random_noseed = args.random_noseed)

        if do_print:
            print("Elongating motif %d" % seed_idx)

        bestmi, lastmyfreq, n_bestmotif = elongate_motif(
            n_bestmotif, bestmi, seqs_of_interest, discr_exp_profile,
            nbins, lastmyfreq, args, do_print = do_print)

        w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
        bestmotif_profile, bestmotif_mi, pvalue, z_score = get_characteristics(
            n_bestmotif, seqs_of_interest, discr_exp_profile, nbins, args,
            do_print=do_print)

        if do_print:
            print("Checking robustness of the optimized motif %d (sequence %s)" %
                  (seed_idx, w_bestmotif.print_sequence(return_string=True)))

        is_robust = check_robustness(bestmotif_profile, discr_exp_profile,
                                     nbins, args, do_print = do_print)

        # Store everything collected for this seed.
        seeds_optimized[seed_idx] = w_bestmotif
        profiles_optimized[seed_idx] = bestmotif_profile.values
        seed_charact_array[seed_idx, :] = np.array(
            [bestmotif_mi, pvalue, z_score], dtype=np.float64)
        robustness_array[seed_idx] = is_robust

    return seeds_optimized, profiles_optimized, seed_charact_array, robustness_array
def get_characteristics(n_bestmotif, seqs_of_interest, discr_exp_profile, nbins, args, do_print = False):
    """Compute profile, MI, p-value and z-score for one motif.

    The profile comes from ``matchmaker.calculate_profile_one_motif`` (called
    with ``is_degenerate = True``; the second element of its result is
    discarded). Its ``values`` are scored against ``discr_exp_profile`` with
    ``MI.mut_info`` and tested with
    ``statistic_tests.MI_get_pvalue_and_zscore`` using
    ``args.n_permutations`` permutations.

    Returns:
        (bestmotif_profile, bestmotif_mi, pvalue, z_score)
    """
    profile_and_timing = matchmaker.calculate_profile_one_motif(
        n_bestmotif, seqs_of_interest, is_degenerate = True)
    bestmotif_profile = profile_and_timing[0]

    bestmotif_mi = MI.mut_info(bestmotif_profile.values, discr_exp_profile,
                               x_bins=2, y_bins=nbins)

    significance = statistic_tests.MI_get_pvalue_and_zscore(
        bestmotif_profile.values, discr_exp_profile, nbins,
        bestmotif_mi, args.n_permutations)
    pvalue, z_score = significance

    if do_print:
        print("The final p-value is: %.4f, z-score is: %.3f" % (pvalue, z_score))

    return bestmotif_profile, bestmotif_mi, pvalue, z_score
def calculate_MIs_all_seeds(profiles_passed, discr_exp_profile, index_array, nbins):
    """Return the MI of every profile with the expression profile.

    Every row of ``profiles_passed`` is restricted with ``index_array`` and
    scored with ``MI.mut_info`` (2 bins for the profile, ``nbins`` for the
    expression). The result is a float32 array with one entry per row.
    """
    per_seed_mi = [
        MI.mut_info(full_profile[index_array], discr_exp_profile,
                    x_bins=2, y_bins=nbins)
        for full_profile in profiles_passed
    ]
    return np.array(per_seed_mi, dtype=np.float32)
def get_current_statistics(index, MI_values_array, profiles_array, index_array, discr_exp_profile, args, nbins=None):
    """Return the permutation p-value and z-score of one seed.

    Args:
        index: position of the seed in ``MI_values_array`` / ``profiles_array``.
        MI_values_array: precomputed MI value per seed.
        profiles_array: seed profiles, indexable by ``index``.
        index_array: index applied to the profile before scoring.
        discr_exp_profile: discretized expression profile.
        args: options object; ``max_pvalue``, ``min_zscore`` and
            ``n_permutations`` are read from it.
        nbins: number of expression bins. Optional so that existing callers
            keep working; when omitted, ``args.nbins`` is used if present
            (NOTE(review): confirm the options object carries ``nbins``).

    Returns:
        ``(pvalue, z_score)``. For a seed whose stored MI equals -1 a pair
        that lies just outside the acceptance thresholds is returned without
        running any permutations.

    Raises:
        ValueError: if the number of bins cannot be determined.
        AssertionError: if the stored MI disagrees with a fresh computation.
    """
    active_profile = profiles_array[index][index_array]
    current_MI = MI_values_array[index]

    # -1 marks a seed that was not scored (presumably the mask-out value
    # written when MI values were computed -- verify against the producer).
    # Report statistics that can never pass the thresholds.
    if current_MI == -1:
        return args.max_pvalue + 0.1, args.min_zscore - 0.1

    if nbins is None:
        nbins = getattr(args, "nbins", None)
    if nbins is None:
        raise ValueError(
            "get_current_statistics needs the number of expression bins: "
            "pass nbins explicitly or provide args.nbins")

    # Sanity check: the cached MI must match a fresh computation made with
    # the same binning every other MI call uses.
    recomputed_MI = MI.mut_info(active_profile, discr_exp_profile,
                                x_bins=2, y_bins=nbins)
    assert np.isclose(current_MI, recomputed_MI, rtol=1e-10), \
        "stored MI value does not match the recomputed one"

    # MI_get_pvalue_and_zscore takes nbins as its third positional argument.
    pvalue, z_score = statistic_tests.MI_get_pvalue_and_zscore(
        active_profile, discr_exp_profile, nbins,
        current_MI, args.n_permutations)
    return pvalue, z_score
def jackknife_test(active_profile, discr_exp_profile, nbins, n_permutations, max_pvalue, n_samples, fraction_retain, min_fraction_passed, do_print=False):
    """Test how robust a profile's MI significance is to subsampling.

    ``n_samples`` times, a random subset holding ``fraction_retain`` of the
    positions is drawn without replacement. The MI of the subsampled profile
    with the subsampled expression profile is computed, and its permutation
    p-value is obtained from ``MI_get_pvalue_and_zscore``. A subsample passes
    when its p-value is below ``max_pvalue``.

    Args:
        active_profile: profile to test; indexed by position.
        discr_exp_profile: discretized expression profile aligned with
            ``active_profile``.
        nbins: number of expression bins.
        n_permutations: permutations per subsample.
        max_pvalue: p-value threshold for a subsample to pass.
        n_samples: number of subsamples to draw.
        fraction_retain: fraction of positions kept in each subsample.
        min_fraction_passed: fraction of subsamples that must pass.
        do_print: when true, report progress on stdout.

    Returns:
        True if the fraction of passing subsamples is at least
        ``min_fraction_passed``, False otherwise.
    """
    # Neither the candidate positions nor the subsample size change between
    # iterations, so compute them once.
    full_indices_array = np.arange(active_profile.shape[0])
    how_many_keep = int(fraction_retain * active_profile.shape[0])

    total_number_passed = 0
    for j in range(n_samples):
        subsampl_index_array = np.random.choice(full_indices_array,
                                                size=how_many_keep,
                                                replace=False)
        curr_profile = active_profile[subsampl_index_array]
        curr_exp_profile = discr_exp_profile[subsampl_index_array]

        curr_MI = MI.mut_info(curr_profile, curr_exp_profile,
                              x_bins=2, y_bins=nbins)
        # The null distribution must be built from the same subsampled
        # expression values the observed MI was computed on; the full-length
        # profile does not line up with the subsampled profile.
        pvalue, z_score = MI_get_pvalue_and_zscore(curr_profile, curr_exp_profile,
                                                   nbins, curr_MI, n_permutations)

        if do_print:
            print("Iteration %d. p-value: %.5f; max_pvalue: %.5f, z-score: %.2f" %
                  (j, pvalue, max_pvalue, z_score))

        if pvalue < max_pvalue:
            total_number_passed += 1

    fraction_passed = total_number_passed / float(n_samples)
    passed = bool(fraction_passed >= min_fraction_passed)

    if do_print:
        print("%.2f subsamples passed the test; required fraction is %.2f" %
              (fraction_passed, min_fraction_passed))
        print("Passed robustness test" if passed else "Did not pass robustness test")

    return passed
def are_there_better_motifs(n_modified_motifs, seqs_of_interest, discr_exp_profile, nbins, bestmi, n_bestmotif, lastmyfreq, args, do_print = True):
    """Scan candidate motifs and keep the best one found so far.

    Every candidate is matched against ``seqs_of_interest`` and scored with
    ``MI.mut_info``. It replaces the current best motif when all of these hold:
      * its MI is strictly higher than the current best MI;
      * its profile sum exceeds ``args.min_occurences``;
      * its frequency is below ``args.maxfreq`` or below the frequency of the
        current best motif.
    Later candidates are compared against the updated best values.

    Returns:
        (bestmi, lastmyfreq, n_bestmotif) after the scan; the inputs are
        returned unchanged when no candidate qualifies.
    """
    n_sequences = float(len(seqs_of_interest))

    for candidate in n_modified_motifs:
        candidate_profile, _elapsed = matchmaker.calculate_profile_one_motif(
            candidate, seqs_of_interest, is_degenerate = True)
        candidate_freq = candidate_profile.values.sum() / n_sequences
        candidate_mi = MI.mut_info(candidate_profile.values, discr_exp_profile,
                                   x_bins=2, y_bins=nbins)

        # Guard clauses, checked in the same order as the acceptance rule.
        if not (candidate_mi > bestmi):
            continue
        if not (candidate_profile.sum() > args.min_occurences):
            continue
        if not (candidate_freq < args.maxfreq or candidate_freq < lastmyfreq):
            continue

        n_bestmotif = structures.copy_n_motif(candidate)
        w_bestmotif = type_conversions.n_to_w_motif(n_bestmotif)
        bestmi, lastmyfreq = candidate_mi, candidate_freq

        if do_print:
            print("New motif (MI = %.4f): %s" %
                  (bestmi, w_bestmotif.print_sequence(return_string=True)))

    return bestmi, lastmyfreq, n_bestmotif