def main():
    begin = 9411243
    end = 48119216
    
#     begin = 46287140
#     end = 48119216
     
    breakpoints1 = [
        10148322, 15250019, 15864313, 16491839, 17748811, 18252127, 18912106,
        19637870, 20332293, 20929869, 21190923, 21649595, 22318833, 23231365,
        24271200, 24774771, 25035980, 26088085, 27431612, 27666047, 28290149,
        28485200, 28761470, 29335757, 29790442, 30972911, 32778127, 33370496,
        34413058, 35253882, 35614394, 36328018, 37283402, 38078491, 39227880,
        39908770, 40259482, 40965403, 41448115, 41676786, 42689700, 43100808,
        43345207, 43799567, 44748107, 45265729, 45789905, 46336509, 46883153,
        47465743
    ]
    
    # metric = Metric('chr21', cnst.const['orig_data'], breakpoints1, begin, end)
    metric = Metric('chr21', cnst.return_conf('/nethome/jkpickrell/1kG_data/covariance_matrix/'), breakpoints1, begin, end)
    out = metric.calc_metric()
    print(out)
    print(out['sum']/out['N_zero'])

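    # Uniform alternative: a uniformly spaced grid over [begin, end] with
    # (approximately) as many breakpoints as breakpoints1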
    breakpoints2 = [i for i in range(begin, end+1, int((end-begin)/(len(breakpoints1)-1)))]
    
    metric = Metric('chr21', cnst.return_conf('/nethome/jkpickrell/1kG_data/covariance_matrix/'), breakpoints2, begin, end)
    out = metric.calc_metric()
    print(out)
    print(out['sum']/out['N_zero'])
    
    flat.print_log_msg('Done')
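
# A minimal sketch (not part of the original pipeline) of comparing two
# partitions via the ratio printed in main() above; it only assumes that
# Metric(...).calc_metric() returns a dict with 'sum' and 'N_zero' keys,
# as used there.
def compare_partitions(conf, bpoints_a, bpoints_b, begin, end, name='chr21'):
    out_a = Metric(name, conf, bpoints_a, begin, end).calc_metric()
    out_b = Metric(name, conf, bpoints_b, begin, end).calc_metric()
    # A lower mean value indicates a tighter partition under this metric
    return out_a['sum'] / out_a['N_zero'], out_b['sum'] / out_b['N_zero']
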
def pipeline_lean(dataset_path,
                  name,
                  out_fname,
                  begin=-1,
                  end=-1,
                  img='no',
                  orient='diag',
                  red='sum',
                  dataset_name='NONAME'):
    '''
    pipeline_lean(dataset_path, name, out_fname, begin=-1, end=-1, img='no', orient='diag', red='sum', dataset_name='NONAME')
    '''

    # analysis = matrix_to_vector.MatrixAnalysis(name, cnst.const[dataset], begin, end)
    analysis = matrix_to_vector.MatrixAnalysis(name,
                                               cnst.return_conf(dataset_path),
                                               begin, end)

    print(analysis.snp_first)
    print(analysis.snp_last)

    t = datetime.datetime.now()
    t_formatted = t.strftime('%Y_%m_%d_%H_%M_%S')

    # out_fname = 'vector-'+dataset_name+'-'+name+'-'+str(analysis.snp_first)+'-'+str(analysis.snp_last)+'-'+orient+'-'+red+'-img_'+img+'-'+t_formatted
    # out_fname += '.txt.gz'
    flat.print_log_msg('out_fname: ' + out_fname)

    if (img == 'yes'):
        generate_img = True
    elif (img == 'no'):
        generate_img = False
    else:
        raise Exception('Error: Unknown argument: ' + img)

    if (orient == 'vert'):
        analysis.calc_vert(not generate_img)
    elif (orient == 'diag'):
        analysis.calc_diag_lean(out_fname, cnst.const['out_delim'],
                                not generate_img)
    else:
        raise Exception('Error: Unknown argument: ' + orient)

    if (red == 'avg'):
        avg = True
        raise Exception(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    elif (red == 'sum'):
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    # Output is done step-by-step
    # analysis.write_output_to_file(out_fname+'.txt.gz', cnst.const['out_delim'], avg)

    if generate_img:
        analysis.generate_img(out_fname + cnst.const['img_out_ext'])

    flat.print_log_msg('Done')
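
# Hypothetical invocation of pipeline_lean(); the dataset path and output
# filename below are placeholders, not values from this repository:
# pipeline_lean('/path/to/covariance_matrix/', 'chr21',
#               'vector-chr21.txt.gz', orient='diag', red='sum', img='no')
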
def main():
    breakpoints = [
        20056346, 23172864, 26207725, 27249779, 29266559, 29978822, 31322564,
        33063813, 33715859, 35472318, 36913379, 39281968, 40255964, 42098394,
        43453056, 44942909, 46458807, 48205362, 50455659, 51373940, 53447645,
        54329768, 55790298, 58012944, 58660548, 59583650, 61326949, 63053905,
        65293922, 67115837, 68982238, 70224115, 71640609, 74148469, 75436123,
        78606362, 81047897, 84207279, 87017863, 88725515, 90302399, 92152779,
        93740145, 94947530, 96546735, 97961068, 99269331, 100716423, 102054465
    ]

    breakpoint_index = 33
    total_sum = decimal.Decimal(
        '41049.603797938148512195858044319004257218538046177')
    total_N = decimal.Decimal('116785159748')

    tmp_begin = int(
        (breakpoints[breakpoint_index - 1] + breakpoints[breakpoint_index]) /
        2)
    tmp_end = int(
        (breakpoints[breakpoint_index] + breakpoints[breakpoint_index + 1]) /
        2)

    print('tmp_begin', tmp_begin, 'tmp_end', tmp_end)

    local_search_run = LocalSearch(
        'chr15', tmp_begin, tmp_end, breakpoint_index, breakpoints, total_sum,
        total_N,
        cnst.return_conf('/nethome/jkpickrell/1kG_data/covariance_matrix/'))

    new_breakpoint, new_metric = local_search_run.search()

    print(new_breakpoint, new_metric['sum'] / new_metric['N_zero'])
    print(breakpoints[breakpoint_index], total_sum / total_N)
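
# The search window in main() above is just the midpoints of the two
# intervals flanking the chosen breakpoint; a small helper (a sketch, not
# part of the original code) makes that explicit.
def search_window(breakpoints, index):
    lo = (breakpoints[index - 1] + breakpoints[index]) // 2  # == tmp_begin
    hi = (breakpoints[index] + breakpoints[index + 1]) // 2  # == tmp_end
    return lo, hi
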
def chr_bpoints_to_bed(name, dataset_path, subset, input_pickle_fname):
    '''
    subset is one of ['fourier', 'fourier_ls', 'uniform', 'uniform_ls']
    '''

    # input_config = cnst.const['orig_data_'+dataset]
    input_config = cnst.return_conf(dataset_path)

    partitions = flat.read_partitions(name, input_config)

    with open(input_pickle_fname, 'rb') as f_in:
        loaded = pickle.load(f_in)

        # print(loaded)

        loci = loaded[subset]['loci']

        first = partitions[0][0]
        last = partitions[len(partitions) - 1][1]

        # print(loci)

        # BED output must be strictly tab-delimited; print's default ' '
        # separator would pad the tabs with spaces
        print('chr', 'start', 'stop', sep='\t')

        print(name, first, loci[0], sep='\t')

        for i in range(0, len(loci) - 1):
            print(name, loci[i], loci[i + 1], sep='\t')

        print(name, loci[len(loci) - 1], last + 1, sep='\t')
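
# Hypothetical call (the pickle filename is a placeholder); it prints
# tab-separated chr/start/stop lines that tile [first, last + 1):
# chr_bpoints_to_bed('chr1', '/path/to/covariance_matrix/',
#                    'fourier_ls', 'pickle-chr1.pickle')
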
def pipeline(dataset_path,
             name,
             out_fname,
             begin=-1,
             end=-1,
             img='no',
             orient='diag',
             red='sum',
             snp=None,
             comment='',
             dataset_name='NONAME'):
    '''
    pipeline(dataset_path, name, out_fname, begin=-1, end=-1, img='no', orient='diag', red='sum', snp=None, comment='', dataset_name='NONAME')

    snp, if given, is a pair of SNP loci that are converted into ordinal numbers representing row/col in the image of the matrix
    '''

    # analysis = matrix_to_vector.MatrixAnalysis(name, cnst.const[dataset], begin, end)
    analysis = matrix_to_vector.MatrixAnalysis(name,
                                               cnst.return_conf(dataset_path),
                                               begin, end)

    print(analysis.snp_first)
    print(analysis.snp_last)

    if (img == 'yes'):
        generate_img = True
    elif (img == 'no'):
        generate_img = False
    else:
        raise Exception('Error: Unknown argument: ' + img)

    if (orient == 'vert'):
        analysis.calc_vert(not generate_img)
    elif (orient == 'diag'):
        analysis.calc_diag(not generate_img)
    else:
        raise Exception('Error: Unknown argument: ' + orient)

    if (red == 'avg'):
        avg = True
        raise Exception(
            'Average used, but its output is not always consistent - especially for diag!'
        )
    elif (red == 'sum'):
        avg = False
    else:
        raise Exception('Error: Unknown argument: ' + red)

    t = datetime.datetime.now()
    t_formatted = t.strftime('%Y_%m_%d_%H_%M_%S')

    # out_fname = 'vector-'+dataset_name+'-'+name+'-'+comment+'-'+str(analysis.snp_first)+'-'+str(analysis.snp_last)+'-'+orient+'-'+red+'-img_'+img+'-'+t_formatted

    analysis.write_output_to_file(out_fname, cnst.const['out_delim'], avg)

    if generate_img:
        # flat.print_log_msg('x_values: '+repr(x_values))
        if snp is not None:
            analysis.generate_img(
                'img-' + out_fname + cnst.const['img_out_ext'], snp)
        else:
            analysis.generate_img('img-' + out_fname +
                                  cnst.const['img_out_ext'])

    flat.print_log_msg('Done')
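
# Hypothetical invocation (placeholder paths; snp, if used, is assumed to
# be a pair of loci per the docstring):
# pipeline('/path/to/covariance_matrix/', 'chr21', 'vector-chr21.txt.gz',
#          img='yes', snp=(16050000, 16060000))
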
def pipeline(input_fname,
             chr_name,
             dataset_path,
             n_snps_bw_bpoints,
             out_fname,
             begin=-1,
             end=-1,
             trackback_delta=200,
             trackback_step=20,
             init_search_location=1000):
    # print("n_snps_bw_bpoints", n_snps_bw_bpoints)
    # print("trackback_delta", trackback_delta)
    # print("trackback_step", trackback_step)
    config = cnst.return_conf(dataset_path)
    # begin, end = flat.first_last(chr_name, cnst.const[dataset], begin, end)
    # first_last() just reads the first and last positions in the partitions
    begin, end = flat.first_last(chr_name, config, begin, end)
    # READ DATA
    flat.print_log_msg('* Reading data')

    "just reads into snp pos and val into first and second list"
    init_array, init_array_x = rd.read_data_raw(input_fname)
    # print(init_array)
    # print(init_array_x)

    # Clip the input data to the required range and convert to numpy arrays
    # (find_ge_ind / find_le_ind are just bisect-left / bisect-right lookups)
    begin_ind = binsrch.find_ge_ind(init_array_x,
                                    begin)  # = init_array_x.index(begin)
    end_ind = binsrch.find_le_ind(init_array_x,
                                  end)  # = init_array_x.index(end)
    np_init_array = np.array(init_array[begin_ind:(end_ind + 1)])
    np_init_array_x = np.array(init_array_x[begin_ind:(end_ind + 1)])

    # DETERMINE NUMBER OF BREAKPOINTS
    n_bpoints = int(math.ceil(len(np_init_array_x) / n_snps_bw_bpoints - 1))
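    # e.g. 10,000 SNPs in range with n_snps_bw_bpoints=50 gives
    # ceil(10000/50 - 1) = 199 breakpoints (illustrative arithmetic only)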
    flat.print_log_msg('* Number of breakpoints: ' + repr(n_bpoints))

    # print("hiya")
    # result = [filt.apply_filter_get_minima(np_init_array, width) for width in range(0, 1000)]
    # print(result)
    # raise
    # SEARCH FOR FILTER WIDTH
    # flat.print_log_msg('* Starting search...')
    found_width = find_minima.custom_binary_search_with_trackback(
        np_init_array,
        filt.apply_filter_get_minima,
        n_bpoints,
        trackback_delta=trackback_delta,
        trackback_step=trackback_step,
        init_search_location=init_search_location)
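    # found_width is the filter width at which filtering yields
    # (approximately) n_bpoints local minima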
    flat.print_log_msg('* Found_width: ' + repr(found_width))

    # GET MINIMA LOCATIONS
    flat.print_log_msg('* Applying filter and getting minima locations...')

    "just applies hanning to init_array"
    g = filt.apply_filter(np_init_array, found_width)
    # print("raise", g)
    # print("raise", np_init_array)
    # print("raise", np_init_array_x)
    breakpoint_loci = filt.get_minima_loc(g, np_init_array_x)
    # print("raise", breakpoint_loci)
    # raise

    # METRIC
    flat.print_log_msg('* Calculating metric for non-uniform breakpoints (minima of filtered data)...')

    # metric_out = apply_metric(chr_name, begin, end, cnst.const[dataset], breakpoint_loci)
    metric_out = apply_metric(chr_name, begin, end, config, breakpoint_loci)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out)

    # METRIC FOR UNIFORM BREAKPOINTS
    # (required below: the local search and the pickle output both use
    # breakpoint_loci_uniform and metric_out_uniform)
    flat.print_log_msg('* Calculating metric for uniform breakpoints...')
    # step = int((end-begin)/(len(breakpoint_loci)+1))
    # breakpoint_loci_uniform = [l for l in range(begin+step, end-step+1, step)]
    step = int(len(init_array_x) / (len(breakpoint_loci) + 1))
    breakpoint_loci_uniform = [
        init_array_x[i]
        for i in range(step, len(init_array_x) - step + 1, step)
    ]

    # metric_out_uniform = apply_metric(chr_name, begin, end, cnst.const[dataset], breakpoint_loci_uniform)
    metric_out_uniform = apply_metric(chr_name, begin, end, config,
                                      breakpoint_loci_uniform)
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform)

    # LOCAL SEARCH ON FOURIER - missing N runs
    flat.print_log_msg('* Running local search for fourier...')

    # breakpoint_loci_local_search = run_local_search_complete(chr_name, breakpoint_loci, begin, end, cnst.const[dataset], metric_out)
    breakpoint_loci_local_search = run_local_search_complete(
        chr_name, breakpoint_loci, begin, end, config, metric_out)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM FOURIER LOCAL SEARCH
    flat.print_log_msg('* Calculating metric for new fourier breakpoints...')

    # metric_out_local_search = apply_metric(chr_name, begin, end, cnst.const[dataset], breakpoint_loci_local_search['loci'])
    metric_out_local_search = apply_metric(
        chr_name, begin, end, config, breakpoint_loci_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_local_search)

    # LOCAL SEARCH ON UNIFORM - missing N runs
    flat.print_log_msg('* Running local search for uniform breakpoints...')

    # breakpoint_loci_uniform_local_search = run_local_search_complete(chr_name, breakpoint_loci_uniform, begin, end, cnst.const[dataset], metric_out_uniform)
    breakpoint_loci_uniform_local_search = run_local_search_complete(
        chr_name, breakpoint_loci_uniform, begin, end, config,
        metric_out_uniform)

    # RUN METRIC AGAIN W/ NEW BREAKPOINTS FROM UNIFORM
    flat.print_log_msg('* Calculating metric for new uniform breakpoints...')

    # metric_out_uniform_local_search = apply_metric(chr_name, begin, end, cnst.const[dataset], breakpoint_loci_uniform_local_search['loci'])
    metric_out_uniform_local_search = apply_metric(
        chr_name, begin, end, config,
        breakpoint_loci_uniform_local_search['loci'])
    flat.print_log_msg('Global metric:')
    print_metric(metric_out_uniform_local_search)

    # DUMP DATA INTO PICKLE SO IT CAN BE ANALYZED AND LOOKED AT WITHOUT RE-RUNNING EVERYTHING
    pickle_out = {}
    pickle_out['argv'] = sys.argv
    pickle_out['n_bpoints'] = n_bpoints
    pickle_out['found_width'] = found_width
    pickle_out['fourier'] = {}
    pickle_out['fourier']['loci'] = breakpoint_loci
    pickle_out['fourier']['metric'] = metric_out
    pickle_out['uniform'] = {}
    pickle_out['uniform']['loci'] = breakpoint_loci_uniform
    pickle_out['uniform']['metric'] = metric_out_uniform
    # Yes, breakpoint_loci_local_search is already a dict with 'loci' and 'metrics' keys
    pickle_out['fourier_ls'] = breakpoint_loci_local_search
    pickle_out['fourier_ls']['metric'] = metric_out_local_search
    pickle_out['uniform_ls'] = breakpoint_loci_uniform_local_search
    pickle_out['uniform_ls']['metric'] = metric_out_uniform_local_search

    t = datetime.datetime.now()
    t_formatted = t.strftime('%Y_%m_%d_%H_%M_%S')

    # pickle_dump_fname = 'pickle-'+dataset+'-'+chr_name+'-'+str(n_bpoints)+'-'+str(begin)+'-'+str(end)+'-'+t_formatted+'.pickle'
    with open(out_fname, 'wb') as f_out:
        pickle.dump(pickle_out, f_out)

    flat.print_log_msg('Done')
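
# Hypothetical invocation (placeholder filenames): the input is a vector
# file such as the ones produced by the matrix_to_vector pipelines above,
# and the output pickle is what e.g. chr_bpoints_to_bed() consumes:
# pipeline('vector-chr21.txt.gz', 'chr21', '/path/to/covariance_matrix/',
#          50, 'pickle-chr21.pickle')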