Esempio n. 1
0
def custom_binary_search_with_trackback(np_init_array,
                                        f,
                                        srch_val,
                                        trackback_delta=200,
                                        trackback_step=20,
                                        init_search_location=1000):
    """Search for the width at which ``f`` on ``np_init_array`` meets ``srch_val``.

    The search runs in three stages:

    1. ``find_end`` picks the upper bound ``end_v`` of the search range,
       starting from ``init_search_location``.
    2. ``binsrch.find_le_ind`` searches a ``FlexibleBoundedAccessor`` built
       over ``[0, end_v]`` for ``srch_val``.
    3. ``trackback`` refines that raw index using ``trackback_delta`` and
       ``trackback_step``.

    Raw indices returned by stages 2 and 3 are converted to widths as
    ``end_v - raw_index``.

    Args:
        np_init_array: Data handed unchanged to ``find_end`` and the accessor.
        f: Callable handed unchanged to ``find_end`` and the accessor.
        srch_val: Value being searched for.
        trackback_delta: Forwarded to ``trackback``.
        trackback_step: Forwarded to ``trackback``.
        init_search_location: Starting location forwarded to ``find_end``.

    Returns:
        The width obtained from the trackback stage.
    """
    flat.print_log_msg('Starting custom_binary_search_with_trackback')

    # One-sided binary (i.e., exponential) search first.
    # Original author's note: apply f to np_init_array and check whether
    # init_search_location is smaller than srch_val; if not, double the
    # initial search value and try again.
    print('search_val: ', srch_val)
    end_v = find_end(np_init_array, f, init_search_location, srch_val)
    print('end_v: ', end_v)

    # NOTE(review): the meaning of the trailing ``True`` flag is defined by
    # FlexibleBoundedAccessor, which is not visible here -- confirm.
    wrapper = FlexibleBoundedAccessor(np_init_array, f, 0, end_v, True)

    # Search with deferred detection of equality
    found_width_raw = binsrch.find_le_ind(wrapper, srch_val)
    print('found_width_raw: ', found_width_raw)
    # Intermediate width before trackback; reported only, never returned.
    print('found_width: ', end_v - found_width_raw)

    # Find any remaining "noisy" minima
    found_width_trackback_raw = trackback(wrapper, srch_val, found_width_raw,
                                          trackback_delta, trackback_step)
    print("found_width_trackback_raw", found_width_trackback_raw)

    # Final result
    found_width = end_v - found_width_trackback_raw
    print('found_width final: ', found_width)

    return found_width
Esempio n. 2
0
    def search(self):
        """Look for a lower-metric breakpoint around the initial breakpoint.

        Starting from the locus at the index that ``binsrch.find_le_ind``
        returns for the initial breakpoint, candidate loci are visited first
        to the right (while they do not exceed ``self.snp_last``) and then to
        the left (while they stay above ``self.snp_first``).  For every
        candidate the running sum and running count are adjusted
        incrementally from the precomputed per-locus ``sum_horiz`` /
        ``sum_vert`` values and the index positions, and the metric
        ``sum / N`` is evaluated with ``decimal`` arithmetic.

        Runs ``self.init_search()`` first when ``self.init_complete`` is
        false, and sets ``self.search_complete`` on normal completion.

        Returns:
            A ``(breakpoint, details)`` tuple.  ``breakpoint`` is the accepted
            locus with the lowest metric, or ``None`` when no candidate
            replaced the initial metric; ``details`` holds the matching
            ``'sum'`` and ``'N_zero'`` values.  If looking up the indices of
            ``snp_bottom`` / ``snp_top`` raises, the failure is logged and
            ``(initial breakpoint, None)`` is returned instead.
        """
        print("----- Running search")
        if not self.init_complete:
            flat.print_log_msg('init_search() must be run before search(). Starting automatically...')
            self.init_search()

        flat.print_log_msg('Starting local search...')

        # Local aliases for the two precomputed tables used throughout.
        precomputed = self.precomputed
        data = precomputed['data']
        print("addy", len(data))

        loci = precomputed["locus_list"]
        n_loci = len(loci)
        print("len(locus_list)", n_loci)
        print("locus_list", loci[:5], loci[-5:])

        # The boundary values themselves need not be present in the list,
        # hence the ge/le index lookups instead of an exact match.
        try:
            print("hiihihih", self.snp_bottom, self.snp_top)
            snp_bottom_ind = binsrch.find_ge_ind(loci, self.snp_bottom)
            snp_top_ind = binsrch.find_le_ind(loci, self.snp_top)
        except Exception as e:
            flat.print_log_msg('Error2!')
            flat.print_log_msg(repr(e))
            flat.print_log_msg('self.precomputed[\'locus_list\']: '+repr(loci))
            flat.print_log_msg('self.snp_bottom: '+repr(self.snp_bottom))
            flat.print_log_msg('self.snp_first: '+repr(self.snp_first))
            flat.print_log_msg('self.snp_last: '+repr(self.snp_last))
            flat.print_log_msg('self.snp_top: '+repr(self.snp_top))
            flat.print_log_msg('self.__dict__: '+repr(self.__dict__))
            flat.print_log_msg('Continuing...')
            return self.breakpoints[self.initial_breakpoint_index], None

        print("self.snp_bottom", self.snp_bottom)
        print("self.snp_top", self.snp_top)
        print("self.initial_breakpoint_index", self.initial_breakpoint_index)
        print("snp_bottom_ind", snp_bottom_ind)
        print("snp_top_ind", snp_top_ind)

        # A breakpoint is not necessarily a member of the locus list, so the
        # scan is anchored at the index find_le_ind returns for it.  Both
        # scans start there because total_sum / total_N describe that spot.
        start_ind = binsrch.find_le_ind(loci, self.breakpoints[self.initial_breakpoint_index])
        print("breakpoint_index_in_locus_list", n_loci)
        start_locus = loci[start_ind]

        total_sum = self.total_sum
        total_n = self.total_N
        print("curr_sum", total_sum)
        print("curr_N", total_n)

        best_metric = decimal.Decimal(total_sum) / decimal.Decimal(total_n)
        best_breakpoint = None
        best_details = {'sum': total_sum, 'N_zero': total_n}
        # Distance of the best right-hand minimum from the start locus; stays
        # 0 until the right scan accepts a candidate.
        right_distance = 0

        # ---- Scan to the right -------------------------------------------
        flat.print_log_msg('Searching right...')
        if start_ind + 1 < n_loci:
            running_sum, running_n = total_sum, total_n
            for ind in range(start_ind + 1, n_loci):
                locus = loci[ind]
                if not (locus <= self.snp_last):
                    break

                entry = data[locus]
                running_sum = running_sum - entry['sum_horiz'] + entry['sum_vert']
                running_n = running_n - (ind - snp_bottom_ind - 1) + (snp_top_ind - ind)

                metric = decimal.Decimal(running_sum) / decimal.Decimal(running_n)
                if metric < best_metric:
                    best_metric = metric
                    best_breakpoint = locus
                    best_details.update({'sum': running_sum, 'N_zero': running_n})
                    right_distance = locus - start_locus
            else:
                # Ran off the end of the locus list without passing snp_last
                # (expected only at the end of the last partition).
                flat.print_log_msg('curr_locus_index out of bounds')
        else:
            flat.print_log_msg('Warning: breakpoint_index_in_locus_list+1 < len(self.precomputed["locus_list"]) not satisfied!')
            flat.print_log_msg('Breakpoints: '+repr(self.breakpoints))
            flat.print_log_msg('Locus_list: '+repr(loci))
            flat.print_log_msg('breakpoint_index_in_locus_list: '+ repr(start_ind))

        print("min_metric", best_metric, best_breakpoint, right_distance)

        # ---- Scan to the left --------------------------------------------
        flat.print_log_msg('Searching left...')
        if start_ind - 1 >= 0:
            # Restart from the totals at the initial breakpoint.
            running_sum, running_n = total_sum, total_n
            for ind in range(start_ind - 1, -1, -1):
                locus = loci[ind]
                # The previous breakpoint (snp_first) itself is excluded.
                if not (locus > self.snp_first):
                    break

                entry = data[locus]
                running_sum = running_sum + entry['sum_horiz'] - entry['sum_vert']
                running_n = running_n + (ind - snp_bottom_ind - 1) - (snp_top_ind - ind)

                metric = decimal.Decimal(running_sum) / decimal.Decimal(running_n)
                # right_distance only breaks ties against the right-hand
                # result; it is never updated during the left scan.
                if metric < best_metric or (metric == best_metric and (start_locus - locus) < right_distance):
                    best_metric = metric
                    best_breakpoint = locus
                    best_details.update({'sum': running_sum, 'N_zero': running_n})
            else:
                # Reached index 0 without dropping to snp_first (expected
                # only at the start of the first partition).
                flat.print_log_msg('curr_locus_index out of bounds')
        else:
            flat.print_log_msg('Warning: breakpoint_index_in_locus_list-1 >=0 not satisfied!')
            flat.print_log_msg('Breakpoints: '+repr(self.breakpoints))
            flat.print_log_msg('Locus_list: '+repr(loci))
            flat.print_log_msg('breakpoint_index_in_locus_list: '+ repr(start_ind))

        self.search_complete = True

        flat.print_log_msg('Search done')

        return best_breakpoint, best_details
Esempio n. 3
0
def pipeline(input_fname,
             chr_name,
             dataset_path,
             n_snps_bw_bpoints,
             out_fname,
             begin=-1,
             end=-1,
             trackback_delta=200,
             trackback_step=20,
             init_search_location=1000):
    """Run the breakpoint pipeline end to end and pickle the results.

    Steps, in order:

    1. Load the configuration for ``dataset_path`` and resolve ``begin`` /
       ``end`` through ``flat.first_last``.
    2. Read the raw data from ``input_fname`` and clip it to
       ``[begin, end]``.
    3. Derive the target number of breakpoints from ``n_snps_bw_bpoints``.
    4. Search for the filter width that yields that many minima, apply the
       filter and take the minima locations as breakpoints.
    5. Build a second, index-uniform set of breakpoints for comparison.
    6. Compute the metric for both sets, run the local search on both, and
       compute the metric again on the refined breakpoints.
    7. Dump everything into a pickle at ``out_fname``.

    Args:
        input_fname: File read by ``rd.read_data_raw``.
        chr_name: Chromosome name forwarded to the helper functions.
        dataset_path: Location passed to ``cnst.return_conf``.
        n_snps_bw_bpoints: Divisor used to derive the number of breakpoints
            from the clipped data length.
        out_fname: Path of the pickle file written at the end.
        begin: Lower bound handed to ``flat.first_last`` (default ``-1``).
        end: Upper bound handed to ``flat.first_last`` (default ``-1``).
        trackback_delta: Forwarded to the filter-width search.
        trackback_step: Forwarded to the filter-width search.
        init_search_location: Forwarded to the filter-width search.
    """
    config = cnst.return_conf(dataset_path)
    # Original author's note: just reads first and last position in partitions.
    begin, end = flat.first_last(chr_name, config, begin, end)

    # READ DATA
    flat.print_log_msg('* Reading data')
    init_array, init_array_x = rd.read_data_raw(input_fname)

    # Clip the input data to the required range and convert to numpy arrays.
    # ge/le lookups are used because begin/end need not occur in the data.
    begin_ind = binsrch.find_ge_ind(init_array_x, begin)
    end_ind = binsrch.find_le_ind(init_array_x, end)
    np_init_array = np.array(init_array[begin_ind:(end_ind + 1)])
    np_init_array_x = np.array(init_array_x[begin_ind:(end_ind + 1)])

    # DETERMINE NUMBER OF BREAKPOINTS
    n_bpoints = int(math.ceil(len(np_init_array_x) / n_snps_bw_bpoints - 1))

    # SEARCH FOR FILTER WIDTH
    found_width = find_minima.custom_binary_search_with_trackback(
        np_init_array,
        filt.apply_filter_get_minima,
        n_bpoints,
        trackback_delta=trackback_delta,
        trackback_step=trackback_step,
        init_search_location=init_search_location)

    # GET MINIMA LOCATIONS
    flat.print_log_msg('* Applying filter and getting minima locations...')
    g = filt.apply_filter(np_init_array, found_width)
    breakpoint_loci = filt.get_minima_loc(g, np_init_array_x)

    # METRIC FOR THE FILTER-DERIVED ("fourier") BREAKPOINTS
    metric_out = apply_metric(chr_name, begin, end, config, breakpoint_loci)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out)

    # METRIC FOR UNIFORM BREAKPOINTS
    # Restored from the commented-out code: the later stages and the pickle
    # both need breakpoint_loci_uniform and metric_out_uniform.
    # NOTE(review): this indexes the unclipped init_array_x, exactly as the
    # commented-out code did -- confirm that the clipped array is not the
    # intended source when begin/end restrict the range.
    flat.print_log_msg('* Calculating metric for uniform breakpoints...')
    step = len(init_array_x) // (len(breakpoint_loci) + 1)
    breakpoint_loci_uniform = [
        init_array_x[i]
        for i in range(step, len(init_array_x) - step + 1, step)
    ]
    metric_out_uniform = apply_metric(chr_name, begin, end, config,
                                      breakpoint_loci_uniform)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform)

    # LOCAL SEARCH ON FOURIER - missing N runs
    flat.print_log_msg('* Running local search for fourier...')
    breakpoint_loci_local_search = run_local_search_complete(
        chr_name, breakpoint_loci, begin, end, config, metric_out)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM FOURIER LOCAL SEARCH
    flat.print_log_msg('* Calculating metric for new fourier breakpoints...')
    metric_out_local_search = apply_metric(
        chr_name, begin, end, config, breakpoint_loci_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_local_search)

    # LOCAL SEARCH ON UNIFORM - missing N runs
    flat.print_log_msg('* Running local search for uniform breakpoints...')
    breakpoint_loci_uniform_local_search = run_local_search_complete(
        chr_name, breakpoint_loci_uniform, begin, end, config,
        metric_out_uniform)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM UNIFORM
    flat.print_log_msg('* Calculating metric for new uniform breakpoints...')
    metric_out_uniform_local_search = apply_metric(
        chr_name, begin, end, config,
        breakpoint_loci_uniform_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform_local_search)

    # DUMP DATA INTO PICKLE SO IT CAN BE ANALYZED WITHOUT RE-RUNNING EVERYTHING
    pickle_out = {}
    pickle_out['argv'] = sys.argv
    pickle_out['n_bpoints'] = n_bpoints
    pickle_out['found_width'] = found_width
    pickle_out['fourier'] = {'loci': breakpoint_loci, 'metric': metric_out}
    pickle_out['uniform'] = {
        'loci': breakpoint_loci_uniform,
        'metric': metric_out_uniform,
    }
    # The local-search results are stored as returned (the original comment
    # says they are dicts with 'loci' and 'metrics' keys); the recomputed
    # global metric is added to them in place under 'metric'.
    pickle_out['fourier_ls'] = breakpoint_loci_local_search
    pickle_out['fourier_ls']['metric'] = metric_out_local_search
    pickle_out['uniform_ls'] = breakpoint_loci_uniform_local_search
    pickle_out['uniform_ls']['metric'] = metric_out_uniform_local_search

    with open(out_fname, 'wb') as f_out:
        pickle.dump(pickle_out, f_out)

    flat.print_log_msg('Done')