def test_read_msmc():
    """Check read_msmc against known values for two mutation rates.

    With a rate of 1.0 the parsed values must match the references
    directly; with a rate of 1.25e-8 they must match the references
    divided by that rate.
    """
    expected_sizes = [
        0.00086526447673996,
        9.79088625144171e-05,
        0.000171382542974173,
        0.000457984501804459,
        0.000272927253969727,
        0.000131071906047658,
        6.82882858274492e-05,
        4.10030998343475e-05,
        2.79371748811273e-05,
        2.12348489352847e-05,
        1.99020025394955e-05,
        1.99020025394955e-05,
        2.2919799039202e-05,
        2.2919799039202e-05,
        2.98841094236551e-05,
        2.98841094236551e-05,
        4.04164511122607e-05,
        4.04164511122607e-05,
        5.57810348943842e-05,
        5.57810348943842e-05,
        8.00343187158654e-05,
        8.00343187158654e-05,
        0.000116691016192045,
        0.000116691016192045,
        0.000167278907468,
        0.000167278907468,
        0.000226324223028942,
        0.000226324223028942,
        0.000287970327537451,
        0.000287970327537451,
        0.000345006037605658,
        0.000345006037605658,
        0.000392215310516861,
        0.000392215310516861,
        0.000424300540558889,
        0.000424300540558889,
        0.000447447313078885,
        0.000447447313078885,
        0.000841088166225936,
        0.000841088166225936,
    ]
    expected_times = [
        1.58858e-06,
        3.21843e-06,
        4.89174e-06,
        6.61091e-06,
        8.3785e-06,
        1.01973e-05,
        1.20705e-05,
        1.40013e-05,
        1.59934e-05,
        1.80508e-05,
        2.01779e-05,
        2.23798e-05,
        2.46617e-05,
        2.70297e-05,
        2.94906e-05,
        3.2052e-05,
        3.47225e-05,
        3.75116e-05,
        4.04305e-05,
        4.34919e-05,
        4.67103e-05,
        5.01028e-05,
        5.36893e-05,
        5.74932e-05,
        6.15427e-05,
        6.58717e-05,
        7.05216e-05,
        7.5544e-05,
        8.10035e-05,
        8.69838e-05,
        9.35947e-05,
        0.000100985,
        0.000109364,
        0.000119036,
        0.000130476,
        0.000144477,
        0.000162528,
        0.000187969,
        0.000231461,
    ]

    fixture = joinpath(THIS_DIR, 'msmc_test.final.txt')
    # Dividing by 1.0 leaves the references unchanged, so one loop covers
    # both the unscaled and the rescaled case.
    for mu in (1.0, 1.25e-8):
        sizes, times = read_msmc(fixture, mu)
        assert np.allclose(sizes, np.array(expected_sizes) / mu)
        assert np.allclose(times, np.array(expected_times) / mu)
def test_issue6():
    """Regression check on the first epoch time after decimation.

    Reads the issue-6 MSMC fixture, rescales sizes and times by a fixed
    reference size, runs decimate_sizes with a tolerance of 0.0 and no
    ancestral size, and verifies the first returned time.
    """
    fixture = joinpath(THIS_DIR, 'issue6_msmc.txt')
    raw_sizes, raw_times = read_msmc(fixture, 1.0)

    N_ref = .0005 / 4.
    time_scale = 2. * N_ref
    scaled_times = [t / time_scale for t in raw_times]
    scaled_sizes = [p / N_ref for p in raw_sizes]

    _, new_times = decimate_sizes(scaled_sizes, scaled_times, 0.0, None)

    expected_first = 1.85199e-07 / time_scale
    assert np.abs(new_times[0] - expected_first) < 1e-13
def _main(args):
    """Build a lookup table from parsed command-line options and save it.

    The size history comes from exactly one of: an MSMC file, an SMC++
    file, or comma-separated ``popsizes`` (optionally with
    ``epochtimes``).  Sizes and times are rescaled by a reference size,
    simplified with ``decimate_sizes``, and handed to ``LookupTable``.
    The finished table is optionally downsampled to ``args.samplesize``
    and written to ``args.outfile`` under the HDF key ``'ldtable'``.

    Args:
        args: Parsed options.  Reads ``mu``, ``msmc_file``,
            ``smcpp_file``, ``epochtimes``, ``popsizes``,
            ``decimate_rel_tol``, ``decimate_anc_size``, ``samplesize``,
            ``moran_pop_size``, ``approx``, ``numthreads``,
            ``store_stationary``, ``load_stationary`` and ``outfile``.

    Raises:
        IOError: If more than one size-history source is given, if none
            is given, if the number of sizes is not one more than the
            number of epoch times, or if ``moran_pop_size`` is used
            without ``approx`` or is smaller than ``samplesize``.
    """
    # Reference size chosen so that 4 * n_ref * mu equals the 0.0005 that
    # is also passed to LookupTable below.
    n_ref = 0.0005 / (4. * args.mu)
    exclusive_msg = ('Can only specify one of msmc_file, smcpp_file, or '
                     'popsizes')

    if args.msmc_file:
        if args.smcpp_file or args.epochtimes or args.popsizes:
            raise IOError(exclusive_msg)
        pop_sizes, times = read_msmc(args.msmc_file, args.mu)
    elif args.smcpp_file:
        # msmc_file is known to be unset in this branch.
        if args.epochtimes or args.popsizes:
            raise IOError(exclusive_msg)
        pop_sizes, times = read_smcpp(args.smcpp_file)
    else:
        if not args.popsizes:
            # Fail with a clear message instead of an AttributeError on
            # None.split below.
            raise IOError('Must specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        times = []
        if args.epochtimes:
            times = list(map(float, args.epochtimes.split(',')))
        pop_sizes = list(map(float, args.popsizes.split(',')))

    # Rescale relative to the reference size.
    times = [t / (2. * n_ref) for t in times]
    pop_sizes = [p / n_ref for p in pop_sizes]
    if len(pop_sizes) != len(times)+1:
        raise IOError('Number of population sizes must match '
                      'number of epochs.')
    pop_sizes, times = decimate_sizes(pop_sizes,
                                      times,
                                      args.decimate_rel_tol,
                                      args.decimate_anc_size)

    # One row per epoch: size, left boundary, right boundary.
    logging.info('Size history to be used when computing lookup table is\n'
                 + 'Scaled Size\tScaled Left Time\tScaled Right Time\n'
                 + '\n'.join([str(p) + '\t' + str(t1) + '\t' + str(t2)
                              for p, t1, t2 in zip(pop_sizes,
                                                   [0] + times,
                                                   times + [float('inf')])]))

    max_size = args.samplesize
    num_particles = max_size
    if args.moran_pop_size:
        if not args.approx:
            # This branch is reached when approx is off, so the user must
            # enable it (the previous text said to turn it off).
            raise IOError('Cannot use moran_pop_size when computing an exact '
                          'lookup table. Turn on the --approx flag.')
        if max_size > args.moran_pop_size:
            raise IOError('moran_pop_size must be at least as large as the '
                          'desired sample size.')
        num_particles = args.moran_pop_size

    # 0.0, 0.1, ..., 9.9 followed by 10, 11, ..., 100.
    rho_grid = [i * .1 for i in range(100)] + list(range(10, 101))
    logging.info('Beginning Lookup Table. This may take a while')
    table = LookupTable(num_particles, 0.0005, rho_grid, pop_sizes,
                        times, not args.approx, args.numthreads,
                        store_stationary=args.store_stationary,
                        load_stationary=args.load_stationary).table
    logging.info('\t...complete')

    # Divide the column labels by 4 * n_ref.
    table.columns /= 4. * n_ref

    if num_particles > max_size:
        logging.info('Downsampling')
        table = downsample(table, max_size)
        logging.info('\t...complete')
    table.to_hdf(args.outfile, 'ldtable', mode='w')
def _main(args):
    """Score block-penalty / window-size combinations on simulated data.

    Loads a lookup table, simulates ``args.num_sims`` data sets with
    msprime under the requested size history, runs the optimizer for
    every (block penalty, window size) pair and writes one
    tab-separated score line per pair to ``args.outfile`` (or stdout if
    no output file is given).

    Args:
        args: Parsed options.  Reads ``tablefile``, ``samplesize``,
            ``blockpenalty``, ``windowsize``, ``msmc_file``,
            ``smcpp_file``, ``epochtimes``, ``popsizes``, ``mu``,
            ``decimate_rel_tol``, ``decimate_anc_size``, ``numthreads``,
            ``ploidy``, ``num_sims``, ``metawindow``, ``overlap`` and
            ``outfile``.

    Raises:
        IOError: If the lookup table is too small for ``samplesize``,
            if more than one size-history source is given, if none is
            given, or if the number of sizes is not one more than the
            number of epoch times.
    """
    table = read_hdf(args.tablefile, 'ldtable')
    # The first index label is a whitespace-separated list of integers
    # whose sum is taken as the number of haploids the table supports.
    table_size = sum(map(int, table.index.values[0].split()))
    if table_size < args.samplesize:
        raise IOError('Lookup table was constructed for {} haploids, '
                      'but --samplesize is {} haploids.  Either build '
                      'a larger lookup table or simulate fewer '
                      'individuals.'.format(table_size, args.samplesize))

    # Rescale column labels so the largest becomes 100; the original
    # maximum is still passed to the optimizer below.
    max_rho = table.columns[-1]
    table.columns *= 100. / max_rho

    block_penalties = list(map(float, args.blockpenalty.split(',')))
    window_sizes = list(map(float, args.windowsize.split(',')))
    logging.info('Searching over Windowsizes %s, and Block Penalties %s',
                 window_sizes, block_penalties)

    exclusive_msg = ('Can only specify one of msmc_file, smcpp_file, or '
                     'popsizes')
    if args.msmc_file:
        if args.smcpp_file or args.epochtimes or args.popsizes:
            raise IOError(exclusive_msg)
        pop_sizes, times = read_msmc(args.msmc_file, args.mu)
    elif args.smcpp_file:
        # msmc_file is known to be unset in this branch.
        if args.epochtimes or args.popsizes:
            raise IOError(exclusive_msg)
        pop_sizes, times = read_smcpp(args.smcpp_file)
    else:
        if not args.popsizes:
            # Fail with a clear message instead of an AttributeError on
            # None.split below.
            raise IOError('Must specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes = list(map(float, args.popsizes.split(',')))
        times = []
        if args.epochtimes:
            times = list(map(float, args.epochtimes.split(',')))
    if len(pop_sizes) != len(times) + 1:
        raise IOError('Number of population sizes must '
                      'match number of epochs.')
    pop_sizes, times = decimate_sizes(pop_sizes,
                                      times,
                                      args.decimate_rel_tol,
                                      args.decimate_anc_size)

    pop_config = [
        msprime.PopulationConfiguration(sample_size=args.samplesize,
                                        initial_size=pop_sizes[0])]
    demography = []
    if times:
        for pop_size, time in zip(pop_sizes[1:], times):
            # NOTE(review): times are doubled before being handed to
            # msprime; presumably a unit conversion -- confirm.
            demography.append(
                msprime.PopulationParametersChange(time=time * 2,
                                                   initial_size=pop_size,
                                                   population_id=0))

    reco_maps = _load_hapmap()
    pool = Pool(args.numthreads, maxtasksperchild=100)
    try:
        logging.info('Simulating data...')
        simulation_args = [((pop_config, args.mu, demography, args.ploidy),
                            reco_maps) for k in range(args.num_sims)]
        test_set = list(pool.imap(_simulate_data, simulation_args,
                                  chunksize=10))
        logging.info('\tdone simulating')

        scores = {}
        for block_penalty in block_penalties:
            for window_size in window_sizes:
                estimates = list(pool.imap(partial(_call_optimize,
                                                   metawindow=args.metawindow,
                                                   windowsize=window_size,
                                                   table=table,
                                                   ploidy=args.ploidy,
                                                   bpen=block_penalty,
                                                   overlap=args.overlap,
                                                   max_rho=max_rho),
                                           test_set,
                                           chunksize=10))
                # Elements 1 and 2 of every simulated item are passed to
                # the scorer alongside the estimates.
                scores[(block_penalty,
                        window_size)] = _score(estimates,
                                               [ts[1] for ts in test_set],
                                               [ts[2] for ts in test_set],
                                               pool)
    finally:
        # Always shut the workers down; every result has been consumed
        # by this point (or an error occurred), so terminating is safe.
        pool.terminate()
        pool.join()

    ofile = open(args.outfile, 'w') if args.outfile else sys.stdout
    try:
        ofile.write('\t'.join(['Block_Penalty',
                               'Window_Size',
                               'Pearson_Corr_1bp',
                               'Pearson_Corr_10kb',
                               'Pearson_Corr_100kb',
                               'Log_Pearson_Corr_1bp',
                               'Log_Pearson_Corr_10kb',
                               'Log_Pearson_Corr_100kb',
                               'Spearman_Corr_1bp',
                               'Spearman_Corr_10kb',
                               'Spearman_Corr_100kb',
                               'L2',
                               'Log_L2']) + '\n')
        for block_penalty, window_size in sorted(scores):
            line = ([block_penalty, window_size]
                    + scores[block_penalty, window_size])
            ofile.write('\t'.join(map(str, line)) + '\n')
    finally:
        # Close only a file we opened ourselves; never close stdout.
        if args.outfile:
            ofile.close()