def _main(args): n_ref = 0.0005 / (4. * args.mu) if args.msmc_file: if args.smcpp_file or args.epochtimes or args.popsizes: raise IOError('Can only specify one of msmc_file, smcpp_file, or ' 'popsizes') pop_sizes, times = read_msmc(args.msmc_file, args.mu) elif args.smcpp_file: if args.msmc_file or args.epochtimes or args.popsizes: raise IOError('Can only specify one of msmc_file, smcpp_file, or ' 'popsizes') pop_sizes, times = read_smcpp(args.smcpp_file) else: times = [] if args.epochtimes: times = list(map(float, args.epochtimes.split(','))) pop_sizes = list(map(float, args.popsizes.split(','))) times = [t / (2. * n_ref) for t in times] pop_sizes = [p / n_ref for p in pop_sizes] if len(pop_sizes) != len(times)+1: raise IOError('Number of population sizes must match ' 'number of epochs.') pop_sizes, times = decimate_sizes(pop_sizes, times, args.decimate_rel_tol, args.decimate_anc_size) logging.info('Size history to be used when computing lookup table is\n' + 'Scaled Size\tScaled Left Time\tScaled Right Time\n' + '\n'.join([str(p) + '\t' + str(t1) + '\t' + str(t2) for p, t1, t2 in zip(pop_sizes, [0] + times, times + [float('inf')])])) max_size = args.samplesize num_particles = max_size if args.moran_pop_size: if not args.approx: raise IOError('Cannot use moran_pop_size when computing an exact ' 'lookup table. Turn off --aprox flag.') if max_size > args.moran_pop_size: raise IOError('moran_pop_size must be at least as large as the ' 'desired sample size.') num_particles = args.moran_pop_size rho_grid = [i * .1 for i in range(100)] + list(range(10, 101)) logging.info('Beginning Lookup Table. This may take a while') table = LookupTable(num_particles, 0.0005, rho_grid, pop_sizes, times, not args.approx, args.numthreads, store_stationary=args.store_stationary, load_stationary=args.load_stationary).table logging.info('\t...complete') table.columns /= 4. 
* n_ref if num_particles > max_size: logging.info('Downsampling') table = downsample(table, max_size) logging.info('\t...complete') table.to_hdf(args.outfile, 'ldtable', mode='w')
def test_read_smcpp():
    """Check the head and tail of what read_smcpp returns for the fixture.

    Only the first three and last three sizes and times read from
    ``ACB_pop_sizes.csv`` are compared against reference values.
    """
    fixture = joinpath(THIS_DIR, 'ACB_pop_sizes.csv')
    sizes, times = read_smcpp(fixture)

    # (observed slice, reference values) pairs, compared with np.allclose.
    checks = (
        (sizes[:3],
         [138482.84333082315, 138482.84333082315, 139331.82583178935]),
        (sizes[-3:],
         [19408.187247411068, 20959.43140840318, 23058.569473392425]),
        (times[:3],
         [50.0, 53.97505585700569, 58.2661330953377]),
        (times[-3:],
         [83485.36048509754, 90122.53990850793, 97287.38251073883]),
    )
    for observed, reference in checks:
        assert np.allclose(observed, reference)
def test_decimate_sizes():
    """Exercise decimate_sizes on the ACB_pop_sizes.csv size history.

    Three calls are checked: third argument 0.0 with fourth argument None,
    third argument 0.0 with fourth argument 1.0, and third argument 0.25
    with fourth argument None.
    """
    sizes, times = read_smcpp(joinpath(THIS_DIR, 'ACB_pop_sizes.csv'))

    # 0.0 / None: the output equals the input with its first entry dropped.
    dec_sizes, dec_times = decimate_sizes(sizes, times, 0.0, None)
    assert np.allclose(sizes[1:], dec_sizes)
    assert np.allclose(times[1:], dec_times)

    # 0.0 / 1.0: as above, except the last returned size equals 1.0.
    dec_sizes, dec_times = decimate_sizes(sizes, times, 0.0, 1.0)
    assert np.allclose(sizes[1:-1], dec_sizes[:-1])
    assert np.allclose(1.0, dec_sizes[-1])
    assert np.allclose(times[1:], dec_times)

    # 0.25 / None: walk the original epochs alongside the decimated ones and
    # require each original size to be within 25% of its decimated size.
    tolerance = 0.25
    dec_sizes, dec_times = decimate_sizes(sizes, times, tolerance, None)
    cursor = 0
    for position, epoch_time in enumerate(times):
        # Advance by at most one decimated epoch per original epoch.
        if epoch_time > dec_times[cursor]:
            cursor += 1
        assert dec_times[cursor] >= epoch_time
        original_size = sizes[position]
        deviation = np.abs(original_size - dec_sizes[cursor]) / original_size
        assert deviation < tolerance
def _main(args):
    """Score block-penalty / window-size settings on simulated data.

    Loads the lookup table stored under ``'ldtable'`` in ``args.tablefile``,
    simulates ``args.num_sims`` data sets under the requested size history,
    runs ``_call_optimize`` for every (block penalty, window size) pair and
    writes one tab-separated row of scores per pair to ``args.outfile`` (or
    to standard output when no outfile is given).

    Args:
        args: Parsed command-line namespace.  Attributes read here:
            ``tablefile``, ``samplesize``, ``blockpenalty``, ``windowsize``,
            ``msmc_file``, ``smcpp_file``, ``epochtimes``, ``popsizes``,
            ``mu``, ``decimate_rel_tol``, ``decimate_anc_size``, ``ploidy``,
            ``numthreads``, ``num_sims``, ``metawindow``, ``overlap`` and
            ``outfile``.

    Raises:
        IOError: If the table covers fewer haploids than ``samplesize``, if
            more than one size-history source is given, if none is given, or
            if the number of sizes is not one more than the number of epoch
            times.
    """
    table = read_hdf(args.tablefile, 'ldtable')
    # The first index label is a whitespace-separated list of integers whose
    # sum is compared against the requested sample size.
    table_size = sum(map(int, table.index.values[0].split()))
    if table_size < args.samplesize:
        raise IOError('Lookup table was constructed for {} haploids, '
                      'but --samplesize is {} haploids.  Either build '
                      'a larger lookup table or simulate fewer '
                      'individuals.'.format(table_size, args.samplesize))

    # Rescale the column labels so the largest becomes 100; the original
    # maximum is passed on to _call_optimize.
    max_rho = table.columns[-1]
    table.columns *= 100. / max_rho

    block_penalties = list(map(float, args.blockpenalty.split(',')))
    window_sizes = list(map(float, args.windowsize.split(',')))
    logging.info('Searching over Windowsizes %s, and Block Penalties %s',
                 window_sizes, block_penalties)

    manual_history = args.epochtimes or args.popsizes
    if args.msmc_file:
        if args.smcpp_file or manual_history:
            raise IOError('Can only specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes, times = read_msmc(args.msmc_file, args.mu)
    elif args.smcpp_file:
        if manual_history:
            raise IOError('Can only specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes, times = read_smcpp(args.smcpp_file)
    else:
        # Fail with the same exception type as the other argument errors
        # rather than an AttributeError from calling .split on None.
        if not args.popsizes:
            raise IOError('Must specify one of msmc_file, smcpp_file, or '
                          'popsizes')
        pop_sizes = list(map(float, args.popsizes.split(',')))
        times = []
        if args.epochtimes:
            times = list(map(float, args.epochtimes.split(',')))
    if len(pop_sizes) != len(times) + 1:
        raise IOError('Number of population sizes must '
                      'match number of epochs.')
    pop_sizes, times = decimate_sizes(pop_sizes,
                                      times,
                                      args.decimate_rel_tol,
                                      args.decimate_anc_size)

    pop_config = [
        msprime.PopulationConfiguration(sample_size=args.samplesize,
                                        initial_size=pop_sizes[0])]
    demography = []
    if times:
        for pop_size, time in zip(pop_sizes[1:], times):
            # NOTE(review): times are doubled before being given to msprime;
            # presumably a unit conversion -- confirm.
            demography.append(
                msprime.PopulationParametersChange(time=time * 2,
                                                   initial_size=pop_size,
                                                   population_id=0))
    reco_maps = _load_hapmap()

    pool = Pool(args.numthreads, maxtasksperchild=100)
    try:
        logging.info('Simulating data...')
        simulation_args = [((pop_config, args.mu, demography, args.ploidy),
                            reco_maps) for _ in range(args.num_sims)]
        test_set = list(pool.imap(_simulate_data, simulation_args,
                                  chunksize=10))
        logging.info('\tdone simulating')

        # Elements 1 and 2 of each simulated item are what _score compares
        # the estimates against.
        second_fields = [ts[1] for ts in test_set]
        third_fields = [ts[2] for ts in test_set]

        scores = {}
        for block_penalty in block_penalties:
            for window_size in window_sizes:
                optimize = partial(_call_optimize,
                                   metawindow=args.metawindow,
                                   windowsize=window_size,
                                   table=table,
                                   ploidy=args.ploidy,
                                   bpen=block_penalty,
                                   overlap=args.overlap,
                                   max_rho=max_rho)
                estimates = list(pool.imap(optimize, test_set, chunksize=10))
                scores[(block_penalty, window_size)] = _score(estimates,
                                                              second_fields,
                                                              third_fields,
                                                              pool)
    finally:
        # Always release the worker processes, including on failure.
        pool.close()
        pool.join()

    header = ['Block_Penalty',
              'Window_Size',
              'Pearson_Corr_1bp',
              'Pearson_Corr_10kb',
              'Pearson_Corr_100kb',
              'Log_Pearson_Corr_1bp',
              'Log_Pearson_Corr_10kb',
              'Log_Pearson_Corr_100kb',
              'Spearman_Corr_1bp',
              'Spearman_Corr_10kb',
              'Spearman_Corr_100kb',
              'L2',
              'Log_L2']
    ofile = open(args.outfile, 'w') if args.outfile else sys.stdout
    try:
        ofile.write('\t'.join(header) + '\n')
        for block_penalty, window_size in sorted(scores):
            line = ([block_penalty, window_size]
                    + scores[block_penalty, window_size])
            ofile.write('\t'.join(map(str, line)) + '\n')
    finally:
        # Only close what this function opened; never close sys.stdout.
        if args.outfile:
            ofile.close()