def custom_binary_search_with_trackback(
        np_init_array, f, srch_val,
        trackback_delta=200, trackback_step=20, init_search_location=1000):
    """Search for a filter width, then refine the hit with a trackback pass.

    The steps are:
      1. ``find_end`` picks the upper search bound. Per the original author's
         notes this is a one-sided (exponential) search: apply ``f`` to
         ``np_init_array``, compare against ``srch_val``, and double the
         candidate location until the bound is found.
      2. The array and ``f`` are wrapped in a ``FlexibleBoundedAccessor``
         spanning ``0 .. upper bound`` and searched with
         ``binsrch.find_le_ind``.
      3. ``trackback`` is run from that index to deal with any remaining
         "noisy" minima.

    Args:
        np_init_array: Input data, handed unchanged to ``find_end`` and the
            accessor.
        f: Callable forwarded to ``find_end`` and the accessor.
        srch_val: Value being searched for.
        trackback_delta: Forwarded to ``trackback``.
        trackback_step: Forwarded to ``trackback``.
        init_search_location: Starting location handed to ``find_end``.

    Returns:
        The upper bound minus the index returned by ``trackback``.
    """
    flat.print_log_msg('Starting custom_binary_search_with_trackback')

    # Stage 1: establish the upper end of the search range.
    print('search_val: ', srch_val)
    upper_bound = find_end(np_init_array, f, init_search_location, srch_val)
    print('end_v: ', upper_bound)

    # NOTE(review): the meaning of the trailing True flag is defined by
    # FlexibleBoundedAccessor, which is not visible here -- confirm.
    accessor = FlexibleBoundedAccessor(np_init_array, f, 0, upper_bound, True)

    # Stage 2: search with deferred detection of equality.
    raw_index = binsrch.find_le_ind(accessor, srch_val)
    print('found_width_raw: ', raw_index)
    print('found_width: ', upper_bound - raw_index)

    # Stage 3: find any remaining "noisy" minima.
    refined_index = trackback(
        accessor, srch_val, raw_index, trackback_delta, trackback_step)
    print("found_width_trackback_raw", refined_index)

    # The accessor index is converted back to a width relative to the bound.
    final_width = upper_bound - refined_index
    print('found_width final: ', final_width)
    return final_width
def search(self):
    """Scan loci around the initial breakpoint for the lowest metric.

    Starting from the locus at (or just below) the initial breakpoint, the
    method walks right up to ``self.snp_last`` and then left down to (but
    excluding) ``self.snp_first``. At each step it updates a running sum and
    a running N from the precomputed per-locus data and keeps the locus with
    the smallest ``sum / N``. A left-hand locus that merely ties the current
    minimum wins only if it is closer to the starting locus than the best
    right-hand locus was.

    Runs ``self.init_search()`` first if it has not been run, and sets
    ``self.search_complete`` on normal completion.

    Returns:
        tuple: ``(breakpoint, details)``, where ``details`` is a dict with
        keys ``'sum'`` and ``'N_zero'``. ``breakpoint`` is ``None`` when no
        locus beats the initial metric. If the index lookups for
        ``snp_bottom`` / ``snp_top`` raise, the state is logged and
        ``(self.breakpoints[self.initial_breakpoint_index], None)`` is
        returned instead.
    """
    print("----- Running search")
    if not self.init_complete:
        flat.print_log_msg('init_search() must be run before search(). Starting automatically...')
        self.init_search()

    flat.print_log_msg('Starting local search...')

    # Bind the precomputed containers once; they are only read from here on.
    data = self.precomputed['data']
    print("addy", len(data))
    loci = self.precomputed["locus_list"]
    n_loci = len(loci)
    print("len(locus_list)", n_loci)
    print("locus_list", loci[:5], loci[-5:])

    # The boundary values themselves need not be present in the locus list,
    # hence the ge/le index searches instead of an exact lookup.
    try:
        print("hiihihih", self.snp_bottom, self.snp_top)
        snp_bottom_ind = binsrch.find_ge_ind(loci, self.snp_bottom)
        snp_top_ind = binsrch.find_le_ind(loci, self.snp_top)
    except Exception as e:
        # Best effort: dump the state and fall back to the initial breakpoint.
        flat.print_log_msg('Error2!')
        flat.print_log_msg(repr(e))
        flat.print_log_msg('self.precomputed[\'locus_list\']: '+repr(loci))
        flat.print_log_msg('self.snp_bottom: '+repr(self.snp_bottom))
        flat.print_log_msg('self.snp_first: '+repr(self.snp_first))
        flat.print_log_msg('self.snp_last: '+repr(self.snp_last))
        flat.print_log_msg('self.snp_top: '+repr(self.snp_top))
        flat.print_log_msg('self.__dict__: '+repr(self.__dict__))
        flat.print_log_msg('Continuing...')
        return self.breakpoints[self.initial_breakpoint_index], None

    print("self.snp_bottom", self.snp_bottom)
    print("self.snp_top", self.snp_top)
    print("self.initial_breakpoint_index", self.initial_breakpoint_index)
    print("snp_bottom_ind", snp_bottom_ind)
    print("snp_top_ind", snp_top_ind)

    # Both sweeps start from the initial breakpoint because that is where the
    # initial sum and N are known, so they can be updated incrementally.
    # A breakpoint does not have to be in the locus list, so the closest
    # locus at or below it is used.
    start_ind = binsrch.find_le_ind(
        loci, self.breakpoints[self.initial_breakpoint_index])
    print("breakpoint_index_in_locus_list", n_loci)
    init_breakpoint_locus = loci[start_ind]

    print("curr_sum", self.total_sum)
    print("curr_N", self.total_N)
    best_metric = decimal.Decimal(self.total_sum) / decimal.Decimal(self.total_N)
    best_breakpoint = None
    best_details = {'sum': self.total_sum, 'N_zero': self.total_N}
    # The minimum starts at distance 0; it only changes if a new minimum is
    # found to the RIGHT.
    best_distance_right = 0

    # ---- Sweep right ----
    flat.print_log_msg('Searching right...')
    if start_ind + 1 >= n_loci:
        flat.print_log_msg('Warning: breakpoint_index_in_locus_list+1 < len(self.precomputed["locus_list"]) not satisfied!')
        flat.print_log_msg('Breakpoints: '+repr(self.breakpoints))
        flat.print_log_msg('Locus_list: '+repr(loci))
        flat.print_log_msg('breakpoint_index_in_locus_list: '+ repr(start_ind))
    else:
        running_sum = self.total_sum
        running_n = self.total_N
        ind = start_ind + 1
        locus = loci[ind]
        while locus <= self.snp_last:
            entry = data[locus]
            running_sum = running_sum - entry['sum_horiz'] + entry['sum_vert']
            horiz_n = ind - snp_bottom_ind - 1
            vert_n = snp_top_ind - ind
            running_n = running_n - horiz_n + vert_n
            metric = decimal.Decimal(running_sum) / decimal.Decimal(running_n)
            if metric < best_metric:
                best_metric = metric
                best_breakpoint = locus
                best_details['sum'] = running_sum
                best_details['N_zero'] = running_n
                best_distance_right = locus - init_breakpoint_locus
            if ind + 1 >= n_loci:
                # Only expected at the end of the chromosome (end of the
                # last partition).
                flat.print_log_msg('curr_locus_index out of bounds')
                break
            ind += 1
            locus = loci[ind]

    print("min_metric", best_metric, best_breakpoint, best_distance_right)

    # ---- Sweep left (running totals restart from the initial values) ----
    flat.print_log_msg('Searching left...')
    if start_ind - 1 < 0:
        flat.print_log_msg('Warning: breakpoint_index_in_locus_list-1 >=0 not satisfied!')
        flat.print_log_msg('Breakpoints: '+repr(self.breakpoints))
        flat.print_log_msg('Locus_list: '+repr(loci))
        flat.print_log_msg('breakpoint_index_in_locus_list: '+ repr(start_ind))
    else:
        running_sum = self.total_sum
        running_n = self.total_N
        ind = start_ind - 1
        locus = loci[ind]
        # Strict comparison: the previous breakpoint itself is excluded.
        while locus > self.snp_first:
            entry = data[locus]
            running_sum = running_sum + entry['sum_horiz'] - entry['sum_vert']
            horiz_n = ind - snp_bottom_ind - 1
            vert_n = snp_top_ind - ind
            running_n = running_n + horiz_n - vert_n
            metric = decimal.Decimal(running_sum) / decimal.Decimal(running_n)
            # best_distance_right is only for comparing against the RIGHT
            # result; it is deliberately not updated during the left sweep.
            if metric < best_metric or (
                    metric == best_metric
                    and (init_breakpoint_locus - locus) < best_distance_right):
                best_metric = metric
                best_breakpoint = locus
                best_details['sum'] = running_sum
                best_details['N_zero'] = running_n
            if ind - 1 < 0:
                # Only expected at the beginning of the chromosome (start of
                # the first partition).
                flat.print_log_msg('curr_locus_index out of bounds')
                break
            ind -= 1
            locus = loci[ind]

    self.search_complete = True
    flat.print_log_msg('Search done')
    return best_breakpoint, best_details
def pipeline(input_fname, chr_name, dataset_path, n_snps_bw_bpoints, out_fname,
             begin=-1, end=-1, trackback_delta=200, trackback_step=20,
             init_search_location=1000):
    """Run breakpoint detection end to end and pickle the results.

    Stages:
      1. Read the raw data and clip it to ``[begin, end]``.
      2. Derive the target number of breakpoints from ``n_snps_bw_bpoints``.
      3. Search for the filter width that yields that many minima.
      4. Apply the filter and take the minima as the "fourier" breakpoints.
      5. Build a uniformly spaced set with the same number of steps.
      6. Compute the metric for both sets, run the local search on each, and
         compute the metric again on the refined breakpoints.
      7. Dump everything to ``out_fname`` as a pickle.

    Args:
        input_fname: Path handed to ``rd.read_data_raw``.
        chr_name: Chromosome name, forwarded to the helpers.
        dataset_path: Passed to ``cnst.return_conf`` to build the config.
        n_snps_bw_bpoints: Divisor used to derive the number of breakpoints
            from the number of clipped positions.
        out_fname: Path of the pickle file to write.
        begin: Range start; resolved through ``flat.first_last``.
            NOTE(review): the -1 default presumably means "use the first
            position of the partitions" -- confirm against ``first_last``.
        end: Range end; resolved the same way as ``begin``.
        trackback_delta: Forwarded to the filter-width search.
        trackback_step: Forwarded to the filter-width search.
        init_search_location: Forwarded to the filter-width search.

    Returns:
        None. Results are written to ``out_fname``.
    """
    config = cnst.return_conf(dataset_path)
    # Per the original author's note: reads the first and last position in
    # the partitions.
    begin, end = flat.first_last(chr_name, config, begin, end)

    # READ DATA
    flat.print_log_msg('* Reading data')
    # Per the original author's note: reads SNP values and positions into
    # the first and second list.
    init_array, init_array_x = rd.read_data_raw(input_fname)

    # Clip the input data to the required range and convert to numpy arrays.
    # begin/end need not be present in the data, hence ge/le index searches.
    begin_ind = binsrch.find_ge_ind(init_array_x, begin)
    end_ind = binsrch.find_le_ind(init_array_x, end)
    np_init_array = np.array(init_array[begin_ind:(end_ind + 1)])
    np_init_array_x = np.array(init_array_x[begin_ind:(end_ind + 1)])

    # DETERMINE NUMBER OF BREAKPOINTS
    n_bpoints = int(math.ceil(len(np_init_array_x) / n_snps_bw_bpoints - 1))

    # SEARCH FOR FILTER WIDTH
    found_width = find_minima.custom_binary_search_with_trackback(
        np_init_array, filt.apply_filter_get_minima, n_bpoints,
        trackback_delta=trackback_delta,
        trackback_step=trackback_step,
        init_search_location=init_search_location)

    # GET MINIMA LOCATIONS
    flat.print_log_msg('* Applying filter and getting minima locations...')
    # Per the original author's note: applies a Hanning filter to the array.
    g = filt.apply_filter(np_init_array, found_width)
    breakpoint_loci = filt.get_minima_loc(g, np_init_array_x)

    # METRIC
    flat.print_log_msg('* Calculating metric for non-uniform breakpoints (minima of filtered data)...')
    metric_out = apply_metric(chr_name, begin, end, config, breakpoint_loci)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out)

    # METRIC FOR UNIFORM BREAKPOINTS
    # The uniform set is required by the local search and the pickle below.
    flat.print_log_msg('* Calculating metric for uniform breakpoints...')
    step = int(len(init_array_x) / (len(breakpoint_loci) + 1))
    breakpoint_loci_uniform = [
        init_array_x[i]
        for i in range(step, len(init_array_x) - step + 1, step)
    ]
    metric_out_uniform = apply_metric(
        chr_name, begin, end, config, breakpoint_loci_uniform)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform)

    # LOCAL SEARCH ON FOURIER - missing N runs
    flat.print_log_msg('* Running local search for fourier...')
    breakpoint_loci_local_search = run_local_search_complete(
        chr_name, breakpoint_loci, begin, end, config, metric_out)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM FOURIER LOCAL SEARCH
    flat.print_log_msg('* Calculating metric for new fourier breakpoints...')
    metric_out_local_search = apply_metric(
        chr_name, begin, end, config, breakpoint_loci_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_local_search)

    # LOCAL SEARCH ON UNIFORM - missing N runs
    flat.print_log_msg('* Running local search for uniform breakpoints...')
    breakpoint_loci_uniform_local_search = run_local_search_complete(
        chr_name, breakpoint_loci_uniform, begin, end, config,
        metric_out_uniform)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM UNIFORM
    flat.print_log_msg('* Calculating metric for new uniform breakpoints...')
    metric_out_uniform_local_search = apply_metric(
        chr_name, begin, end, config,
        breakpoint_loci_uniform_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform_local_search)

    # DUMP DATA INTO PICKLE SO IT CAN BE ANALYZED AND LOOKED AT WITHOUT
    # RE-RUNNING EVERYTHING
    pickle_out = {}
    pickle_out['argv'] = sys.argv
    pickle_out['n_bpoints'] = n_bpoints
    pickle_out['found_width'] = found_width
    pickle_out['fourier'] = {}
    pickle_out['fourier']['loci'] = breakpoint_loci
    pickle_out['fourier']['metric'] = metric_out
    pickle_out['uniform'] = {}
    pickle_out['uniform']['loci'] = breakpoint_loci_uniform
    pickle_out['uniform']['metric'] = metric_out_uniform
    # Per the original author's note, the local-search results are already
    # dicts (with a 'loci' key); the final metric is added next to it.
    pickle_out['fourier_ls'] = breakpoint_loci_local_search
    pickle_out['fourier_ls']['metric'] = metric_out_local_search
    pickle_out['uniform_ls'] = breakpoint_loci_uniform_local_search
    pickle_out['uniform_ls']['metric'] = metric_out_uniform_local_search

    with open(out_fname, 'wb') as f_out:
        pickle.dump(pickle_out, f_out)

    flat.print_log_msg('Done')