def main():
    """Compare the partition metric for hand-picked chr21 breakpoints against
    uniformly spaced breakpoints over the same [begin, end] window.

    Prints the raw metric dict and the sum/N_zero ratio for both breakpoint sets.
    """
    # Analysis window on chr21.
    # NOTE(review): in the original these assignments were commented out,
    # leaving begin/end undefined (NameError at runtime); restored here from
    # the values recorded in the comments.
    begin = 9411243
    end = 48119216
    # Alternative (smaller) window kept for reference:
    # begin = 46287140
    # end = 48119216

    # Hand-picked breakpoint loci (base-pair positions) for chr21.
    breakpoints1 = [10148322, 15250019, 15864313, 16491839, 17748811, 18252127,
                    18912106, 19637870, 20332293, 20929869, 21190923, 21649595,
                    22318833, 23231365, 24271200, 24774771, 25035980, 26088085,
                    27431612, 27666047, 28290149, 28485200, 28761470, 29335757,
                    29790442, 30972911, 32778127, 33370496, 34413058, 35253882,
                    35614394, 36328018, 37283402, 38078491, 39227880, 39908770,
                    40259482, 40965403, 41448115, 41676786, 42689700, 43100808,
                    43345207, 43799567, 44748107, 45265729, 45789905, 46336509,
                    46883153, 47465743]

    # Same covariance-matrix config is used for both runs; build it once.
    config = cnst.return_conf('/nethome/jkpickrell/1kG_data/covariance_matrix/')

    # Metric for the hand-picked breakpoints.
    metric = Metric('chr21', config, breakpoints1, begin, end)
    out = metric.calc_metric()
    print(out)
    print(out['sum'] / out['N_zero'])

    # Uniformly spaced breakpoints with the same count over [begin, end].
    breakpoints2 = [i for i in range(begin, end + 1,
                                     int((end - begin) / (len(breakpoints1) - 1)))]
    metric = Metric('chr21', config, breakpoints2, begin, end)
    out = metric.calc_metric()
    print(out)
    print(out['sum'] / out['N_zero'])

    flat.print_log_msg('Done')
def pipeline_lean(dataset_path, name, out_fname, begin=-1, end=-1, img='no',
                  orient='diag', red='sum', dataset_name='NONAME'):
    '''
    pipeline_lean(dataset_path, name, begin=-1, end=-1, img='no', orient='diag', red='sum')

    Lean matrix-to-vector pipeline: output is written step-by-step by
    calc_diag_lean rather than collected and dumped at the end.
    '''
    analysis = matrix_to_vector.MatrixAnalysis(
        name, cnst.return_conf(dataset_path), begin, end)
    print(analysis.snp_first)
    print(analysis.snp_last)

    # Timestamp retained for the (currently disabled) auto-generated filename.
    stamp = datetime.datetime.now()
    t_formatted = stamp.strftime('%Y_%m_%d_%H_%M_%S')
    # out_fname = 'vector-'+dataset_name+'-'+name+'-'+str(analysis.snp_first)+'-'+str(analysis.snp_last)+'-'+orient+'-'+red+'-img_'+img+'-'+t_formatted
    # out_fname += '.txt.gz'

    flat.print_log_msg('out_fname: ' + out_fname)

    # Validate the 'img' flag and turn it into a boolean.
    if img == 'yes':
        generate_img = True
    else:
        if img != 'no':
            raise Exception('Error: Unknown argument: ' + img)
        generate_img = False

    # Dispatch on orientation of the scan.
    if orient == 'vert':
        analysis.calc_vert(not generate_img)
    else:
        if orient != 'diag':
            raise Exception('Error: Unknown argument: ' + orient)
        analysis.calc_diag_lean(out_fname, cnst.const['out_delim'],
                                not generate_img)

    # Reduction mode: 'avg' is rejected because its output is inconsistent.
    if red == 'avg':
        avg = True
        raise Exception(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    elif red == 'sum':
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    # Output is done step-by-step
    # analysis.write_output_to_file(out_fname+'.txt.gz', cnst.const['out_delim'], avg)
    if generate_img:
        analysis.generate_img(out_fname + cnst.const['img_out_ext'])

    flat.print_log_msg('Done')
def main():
    """Run a local search around one chr15 breakpoint and compare the new
    metric against the precomputed global metric."""
    # Base-pair positions of the current chr15 breakpoints.
    breakpoints = [
        20056346, 23172864, 26207725, 27249779, 29266559, 29978822, 31322564,
        33063813, 33715859, 35472318, 36913379, 39281968, 40255964, 42098394,
        43453056, 44942909, 46458807, 48205362, 50455659, 51373940, 53447645,
        54329768, 55790298, 58012944, 58660548, 59583650, 61326949, 63053905,
        65293922, 67115837, 68982238, 70224115, 71640609, 74148469, 75436123,
        78606362, 81047897, 84207279, 87017863, 88725515, 90302399, 92152779,
        93740145, 94947530, 96546735, 97961068, 99269331, 100716423, 102054465
    ]
    breakpoint_index = 33

    # Precomputed global metric (exact decimal values from a previous run).
    total_sum = decimal.Decimal(
        '41049.603797938148512195858044319004257218538046177')
    total_N = decimal.Decimal('116785159748')

    # Search window: midpoints between the chosen breakpoint and its neighbors.
    left, mid, right = (breakpoints[breakpoint_index - 1],
                        breakpoints[breakpoint_index],
                        breakpoints[breakpoint_index + 1])
    tmp_begin = int((left + mid) / 2)
    tmp_end = int((mid + right) / 2)
    print('tmp_begin', tmp_begin, 'tmp_end', tmp_end)

    searcher = LocalSearch(
        'chr15', tmp_begin, tmp_end, breakpoint_index, breakpoints, total_sum,
        total_N,
        cnst.return_conf('/nethome/jkpickrell/1kG_data/covariance_matrix/'))
    new_breakpoint, new_metric = searcher.search()

    # New breakpoint vs. original, each with its metric ratio.
    print(new_breakpoint, new_metric['sum'] / new_metric['N_zero'])
    print(breakpoints[breakpoint_index], total_sum / total_N)
def chr_bpoints_to_bed(name, dataset_path, subset, input_pickle_fname):
    '''
    Print the breakpoint loci of one chromosome as BED-style lines on stdout.

    subset is one of ['fourier', 'fourier_ls', 'uniform', 'uniform_ls']
    name            -- chromosome name (first BED column)
    dataset_path    -- passed to cnst.return_conf to locate the dataset
    input_pickle_fname -- pickle produced by the breakpoint pipeline, holding
                          a dict with subset -> {'loci': [...], ...}
    '''
    # input_config = cnst.const['orig_data_'+dataset]
    input_config = cnst.return_conf(dataset_path)
    partitions = flat.read_partitions(name, input_config)

    with open(input_pickle_fname, 'rb') as f_in:
        loaded = pickle.load(f_in)
    loci = loaded[subset]['loci']

    # Chromosome extent comes from the first/last partition boundaries.
    first = partitions[0][0]
    last = partitions[len(partitions) - 1][1]

    # BUGFIX(review): the original used print(a, '\t', b), which inserts the
    # default ' ' separator around each tab and yields malformed BED/TSV.
    # sep='\t' produces clean tab-delimited columns.
    print('chr', 'start', 'stop', sep='\t')
    print(name, first, loci[0], sep='\t')
    for lo, hi in zip(loci, loci[1:]):
        print(name, lo, hi, sep='\t')
    # Final interval is half-open up to one past the last partition position.
    print(name, loci[len(loci) - 1], last + 1, sep='\t')
def pipeline(dataset_path, name, out_fname, begin=-1, end=-1, img='no',
             orient='diag', red='sum', snp=None, comment='',
             dataset_name='NONAME'):
    '''
    pipeline(dataset_path, name, begin=-1, end=-1, img='no', orient='diag', red='sum', snp=None, comment='')

    snp1 and snp2 are loci of two SNPs that need to be converted into ordinal
    numbers representing row/col in image of matrix
    '''
    analysis = matrix_to_vector.MatrixAnalysis(
        name, cnst.return_conf(dataset_path), begin, end)
    print(analysis.snp_first)
    print(analysis.snp_last)

    # 'img' flag -> boolean; anything but yes/no is rejected.
    if img == 'yes':
        generate_img = True
    else:
        if img != 'no':
            raise Exception('Error: Unknown argument: ' + img)
        generate_img = False

    # Orientation dispatch.
    if orient == 'vert':
        analysis.calc_vert(not generate_img)
    else:
        if orient != 'diag':
            raise Exception('Error: Unknown argument: ' + orient)
        analysis.calc_diag(not generate_img)

    # Reduction mode: 'avg' is explicitly disallowed.
    if red == 'avg':
        avg = True
        raise Exception(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    elif red == 'sum':
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    # Timestamp retained for the (currently disabled) auto-generated filename.
    stamp = datetime.datetime.now()
    t_formatted = stamp.strftime('%Y_%m_%d_%H_%M_%S')
    # out_fname = 'vector-'+dataset_name+'-'+name+'-'+comment+'-'+str(analysis.snp_first)+'-'+str(analysis.snp_last)+'-'+orient+'-'+red+'-img_'+img+'-'+t_formatted

    analysis.write_output_to_file(out_fname, cnst.const['out_delim'], avg)

    if generate_img:
        img_fname = 'img-' + out_fname + cnst.const['img_out_ext']
        if snp is not None:
            analysis.generate_img(img_fname, snp)
        else:
            analysis.generate_img(img_fname)

    flat.print_log_msg('Done')
def pipeline(input_fname, chr_name, dataset_path, n_snps_bw_bpoints, out_fname,
             begin=-1, end=-1, trackback_delta=200, trackback_step=20,
             init_search_location=1000):
    """Full breakpoint-discovery pipeline for one chromosome.

    Reads the raw diagonal-vector data, low-pass filters it to find candidate
    breakpoints (filter width chosen by binary search so the number of minima
    matches the target), computes the partition metric for both the filtered
    and uniformly spaced breakpoints, refines each set with a local search,
    and pickles all results to out_fname.

    BUGFIX(review): the original left two unconditional debug `raise`
    statements in place (aborting before the local searches and the pickle
    dump), and the uniform-breakpoint computation was commented out while
    `breakpoint_loci_uniform` / `metric_out_uniform` were still referenced
    later (NameError). Both restored from the commented-out code.
    """
    config = cnst.return_conf(dataset_path)

    # Reads first and last position from the chromosome's partitions when
    # begin/end are left at -1.
    begin, end = flat.first_last(chr_name, config, begin, end)

    # READ DATA — SNP positions and values as two parallel lists.
    flat.print_log_msg('* Reading data')
    init_array, init_array_x = rd.read_data_raw(input_fname)

    # Clip the input data to the required range (bisect-style lookups) and
    # convert to numpy arrays.
    begin_ind = binsrch.find_ge_ind(init_array_x, begin)
    end_ind = binsrch.find_le_ind(init_array_x, end)
    np_init_array = np.array(init_array[begin_ind:(end_ind + 1)])
    np_init_array_x = np.array(init_array_x[begin_ind:(end_ind + 1)])

    # DETERMINE NUMBER OF BREAKPOINTS
    n_bpoints = int(math.ceil(len(np_init_array_x) / n_snps_bw_bpoints - 1))
    flat.print_log_msg('* Number of breakpoints: ' + repr(n_bpoints))

    # SEARCH FOR FILTER WIDTH that yields exactly n_bpoints minima.
    flat.print_log_msg('* Starting search...')
    found_width = find_minima.custom_binary_search_with_trackback(
        np_init_array, filt.apply_filter_get_minima, n_bpoints,
        trackback_delta=trackback_delta, trackback_step=trackback_step,
        init_search_location=init_search_location)
    flat.print_log_msg('* Found_width: ' + repr(found_width))

    # GET MINIMA LOCATIONS — apply a Hanning-style filter, then read off the
    # loci of the local minima.
    flat.print_log_msg('* Applying filter and getting minima locations...')
    g = filt.apply_filter(np_init_array, found_width)
    breakpoint_loci = filt.get_minima_loc(g, np_init_array_x)

    # METRIC for non-uniform breakpoints (minima of filtered data).
    flat.print_log_msg(
        '* Calculating metric for non-uniform breakpoints (minima of filtered data)...')
    metric_out = apply_metric(chr_name, begin, end, config, breakpoint_loci)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out)

    # METRIC FOR UNIFORM BREAKPOINTS — same count, evenly spaced by SNP index.
    flat.print_log_msg('* Calculating metric for uniform breakpoints...')
    step = int(len(init_array_x) / (len(breakpoint_loci) + 1))
    breakpoint_loci_uniform = [
        init_array_x[i] for i in range(step, len(init_array_x) - step + 1, step)
    ]
    metric_out_uniform = apply_metric(chr_name, begin, end, config,
                                      breakpoint_loci_uniform)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform)

    # LOCAL SEARCH ON FOURIER breakpoints.
    flat.print_log_msg('* Running local search for fourier...')
    breakpoint_loci_local_search = run_local_search_complete(
        chr_name, breakpoint_loci, begin, end, config, metric_out)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM FOURIER LOCAL SEARCH
    flat.print_log_msg('* Calculating metric for new fourier breakpoints...')
    metric_out_local_search = apply_metric(
        chr_name, begin, end, config, breakpoint_loci_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_local_search)

    # LOCAL SEARCH ON UNIFORM breakpoints.
    flat.print_log_msg('* Running local search for uniform breakpoints...')
    breakpoint_loci_uniform_local_search = run_local_search_complete(
        chr_name, breakpoint_loci_uniform, begin, end, config,
        metric_out_uniform)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM UNIFORM LOCAL SEARCH
    flat.print_log_msg('* Calculating metric for new uniform breakpoints...')
    metric_out_uniform_local_search = apply_metric(
        chr_name, begin, end, config,
        breakpoint_loci_uniform_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform_local_search)

    # DUMP DATA INTO PICKLE SO IT CAN BE ANALYZED AND LOOKED AT WITHOUT
    # RE-RUNNING EVERYTHING.
    pickle_out = {}
    pickle_out['argv'] = sys.argv
    pickle_out['n_bpoints'] = n_bpoints
    pickle_out['found_width'] = found_width
    pickle_out['fourier'] = {}
    pickle_out['fourier']['loci'] = breakpoint_loci
    pickle_out['fourier']['metric'] = metric_out
    pickle_out['uniform'] = {}
    pickle_out['uniform']['loci'] = breakpoint_loci_uniform
    pickle_out['uniform']['metric'] = metric_out_uniform
    # breakpoint_loci_local_search is already a dict with 'loci' and
    # 'metrics' keys; we add the recomputed global metric alongside.
    pickle_out['fourier_ls'] = breakpoint_loci_local_search
    pickle_out['fourier_ls']['metric'] = metric_out_local_search
    pickle_out['uniform_ls'] = breakpoint_loci_uniform_local_search
    pickle_out['uniform_ls']['metric'] = metric_out_uniform_local_search

    with open(out_fname, 'wb') as f_out:
        pickle.dump(pickle_out, f_out)

    flat.print_log_msg('Done')