def test_counts(contrib_data):
    """
    Checks that allele counts are within 95 percent confidence intervals.

    Data are stored in arrays with format [ind, num_alleles] = count

    Only the case where `contrib_data` holds exactly one entry is checked;
    for any other number of entries the test returns without asserting.
    The pedigree path and iteration count are read from the module-level
    `args` object.
    """
    ## Find the probands - they should always contribute their own alleles
    P = ped.Pedigree(args.pedfile)

    ## TODO: Test individuals with multiple offspring +p2 id:112

    ## Case when no regions are specified - only all probands.
    ## Materialise the keys so that indexing also works when keys() returns
    ## a view rather than a list.
    regions = list(contrib_data.keys())
    if len(regions) != 1:
        return

    all_contribs = contrib_data[regions[0]]

    ## Probands should always contribute 1 allele
    for prob in P.probands:
        assert all_contribs[prob, 1] == args.iterations

    ## Test specific individuals
    ## NOTE(review): index 11 is hard-coded; presumably it refers to an
    ## individual of the test pedigree - confirm against the test data.
    counts = Counter(all_contribs[11])
    observed = list(counts.values())

    ## Standard deviation of a binomial with p = 0.5: sqrt(n * 0.5 * 0.5)
    binomial_sd = np.sqrt(args.iterations * 0.25)
    mean = np.mean(observed)
    conf95 = (mean - 2 * binomial_sd, mean + 2 * binomial_sd)

    for count in observed:
        assert conf95[0] < count < conf95[1]
def sim_data(request):
    """
    Generate simulation data to be used in other tests.

    Unpacks `request.param` into the pedigree file, the known genotypes and
    the `sim_homs` / `kinship` settings (the latter three file-level settings
    are stored on the module-level `args`), runs `args.iterations`
    coalescence simulations and yields a dict with the keys 'ancs', 'liks',
    'args' and 'trees'.
    """
    ## Parameters which vary between runs
    (args.pedfile,
     known_genotypes,
     args.sim_homs,
     args.kinship) = request.param

    ## Build everything the ancestor finder needs from the pedigree
    pedigree = ped.Pedigree(args.pedfile)
    ind_cones = pedigree.allowedinds(known_genotypes.keys())[2]
    parent_dict = pedigree.parent_dict.copy()

    ## Every individual, plus the extra key 0, starts with a signal of 0
    signals = {0: 0}
    signals.update(dict.fromkeys(parent_dict, 0))

    ancs_dict = pedigree.ancestors_dict(known_genotypes.keys())
    finder = climb.AncFinder(ind_cones, parent_dict, ancs_dict, args)

    ## Run the simulations, handing fresh copies to every run
    ancs = []
    liks = []
    trees = []
    for _ in range(args.iterations):
        ## Simulate coalescence of all affected alleles
        anc, lik, tree = finder.coalesce(known_genotypes.copy(),
                                         signals.copy())
        ancs.append(anc)
        liks.append(lik)
        trees.append(tree)

    yield {'ancs': ancs, 'liks': liks, 'args': args, 'trees': trees}
def simulate(args, initial_genotypes, outfile):
    """
    Performs the specified number of climbing simulations, returning
    ancestors, importance sampling likelihoods, and inheritance histories.

    Parameters:
        args: options object; `args.pedfile` and `args.iterations` are read
            here and the whole object is passed on to `climb.AncFinder`.
        initial_genotypes: mapping whose keys are passed to the pedigree
            queries; a copy is handed to every simulation.
        outfile: passed unchanged to `incremental_write`.

    Returns:
        A pandas DataFrame with the columns 'Anc', 'Log2Lik' and 'Tree',
        one row per simulation, whose index is named 'Sim'.
    """
    ## Load the pedigree and parent-offspring relationships
    vprint("Loading pedigree...")
    P = ped.Pedigree(os.path.expanduser(args.pedfile))
    parent_dict = P.parent_dict

    ##TODO Pass the whole dictionary to improve importance sampling +t1
    ind_cones = P.allowedinds(initial_genotypes.keys())[2]

    signals = dict()
    signals[0] = 0
    print('create signal dict')
    for ind in parent_dict:
        signals[ind] = 0

    print('create ancestors dict')
    ancs_dict = P.ancestors_dict(initial_genotypes.keys())

    print('initialisation')
    A = climb.AncFinder(ind_cones, parent_dict, ancs_dict, args)

    ## Perform climbing simulations
    vprint("Performing simulations...")
    blocksize = 100
    simdat_dict = {'Anc': [], 'Log2Lik': [], 'Tree': []}

    for i in range(args.iterations):
        if i % 100 == 0:
            vprint(i, "/", args.iterations)

        ## Make a copy of the initial genotypes so they don't get modified
        genotypes = initial_genotypes.copy()
        sig = signals.copy()

        ## Simulate coalescence of all affected alleles
        anc, lik, tree = A.coalesce(genotypes, sig)
        simdat_dict['Anc'].append(anc)
        simdat_dict['Log2Lik'].append(lik)
        simdat_dict['Tree'].append(tree)

        ## Write results to output in blocks
        ## NOTE(review): this also fires at i == 0, when only one result
        ## exists but `blocksize` is passed - confirm that incremental_write
        ## copes with being asked for more rows than are available.
        if i % blocksize == 0:
            incremental_write(simdat_dict['Anc'], simdat_dict['Log2Lik'],
                              simdat_dict['Tree'], blocksize, outfile)

    ## Write leftover results. The count is derived from the number of
    ## iterations rather than from the loop variable, which is unbound when
    ## no iteration ran.
    if args.iterations > 0:
        num_remaining = (args.iterations - 1) % blocksize
    else:
        num_remaining = 0
    incremental_write(simdat_dict['Anc'], simdat_dict['Log2Lik'],
                      simdat_dict['Tree'], num_remaining, outfile)

    simdat_i = pd.DataFrame(simdat_dict)
    simdat_i.index.name = 'Sim'

    return simdat_i
def main(args): pedfile = os.path.expanduser(args.pedfile) climb_lik_file = os.path.expanduser(args.climb_lik_file) control_lik_file = os.path.expanduser(args.control_lik_file) hap_length = args.haplotype_length print "Loading pedigree..." P = ped.Pedigree(pedfile) with tables.open_file(climb_lik_file, 'r') as f: shape = f.root.liks.shape haplotype_liks = np.zeros(shape) for i, tree in enumerate(f.root.trees): num_hidden_transmissions = get_num_hidden_transissions(P, tree) scale = 1. / (len(tree) - num_hidden_transmissions) haplotype_liks[i] = scipy.stats.erlang(2, scale=scale).pdf(hap_length) if i % 10000 == 0: print "Calculating haplotype likelihoods for tree number", i, \ "of", len(haplotype_liks) norm_hap_liks = np.log2(haplotype_liks / np.sum(haplotype_liks)) print "Writing combined likelihoods to file..." with tables.open_file(control_lik_file, 'a') as hapfile: tot_hap_liks = hapfile.create_carray( hapfile.root, 'tot_hap_liks', tables.FloatAtom(), shape=shape, title='Total tree likelihoods, including haplotype length') haps = hapfile.create_carray(hapfile.root, 'haplotype_liks', tables.FloatAtom(), shape=shape, title='Likelihood of observed haplotype length:' + \ str(hap_length) + 'Morgans') haps[:] = norm_hap_liks tot_liks = hapfile.root.tot_liks[:] t = norm_hap_liks + tot_liks tot_hap_liks[:] = norm_hap_liks + tot_liks
import sys, os import numpy as np import time import ped import pysignal # pedfile = os.path.expanduser('~/project/anc_finder/data/BALasc_probands1930.txt') # pedfile = os.path.expanduser('~/project/anc_finder/data/pedEx.txt') pedfile = os.path.expanduser( '~/project/anc_finder/scripts/test/test_data/pedEx2.txt') # pedfile = os.path.expanduser( # '~/project/anc_finder/scripts/test/test_data/pedEx3.txt') P = ped.Pedigree(pedfile) ped_arr = pysignal.sort_ped(P) ninds = len(P.inds) print len(P.probands), "probands" sample_idx = [P.ind_dict[x] for x in list(P.probands)[:10]] samples = np.array(sample_idx, dtype=np.int32) print "Samples:", samples cP = pysignal.cPed() cP.load_ped(ped_arr, len(samples)) cP.load_samples(samples) # cP.print_samples() # for s in samples: # cP.update_ancestor_weights(s, 1) # cP.print_nodes() cP.init_sample_weights()
def main(): parser = argparse.ArgumentParser() parser.add_argument("-o", "--outfile", help="Output file. Both symmetrical inds and " +\ "monogamous couples will be saved in the same file") parser.add_argument("-s", "--split", help="Display monogamous couples separately from " +\ "symmetrical individuals. Any file written will " +\ "still combine the two") requiredNamed = parser.add_argument_group('required named arguments') requiredNamed.add_argument("-f", "--pedfile", help="File containing the pedigree to analyze", required=True) requireOne = parser.add_argument_group('require one of') group = requireOne.add_mutually_exclusive_group(required=True) group.add_argument("-p", "--probandfile", help="File containing column of probands") group.add_argument("-l", "--probandlist", help="List of probands separated by commas (no spaces)") group.add_argument("-P", "--probandpaths-file", help="File containing paths of multiple proband files." +\ " Writes symmetries of each panel to the same " +\ "directory, in a file named 'symmetry.txt'") args = parser.parse_args() if args.probandpaths_file and args.outfile: print "Incompatible command-like options: Cannot specify outfile for", +\ "batch symmetry output." sys.exit() pedfile = os.path.expanduser(args.pedfile) if args.probandfile is not None: probands = map(int, [line.strip() for line in open(args.probandfile, 'rU')]) if args.probandlist is not None: probands = map(int, args.probandlist.split(',')) if args.probandpaths_file is not None: allprobands = [] outpaths = [] probandpaths = [ line.strip() for line in open(args.probandpaths_file, 'rU') ] for path in probandpaths: allprobands.append( map(int, [line.strip() for line in open(path, 'rU')])) outpaths.append(os.path.split(path)[0] + '/symmetry.txt') print "Loading pedigree from file:", pedfile P = ped.Pedigree(pedfile) print "Done." print "Finding symmetries..." 
PS = PedSymmetry(P) if args.outfile is not None: print "Probands:", probands print "Outputting symmetries to", args.outfile PS.write_symmetrical_inds(probands, args.outfile) elif args.probandpaths_file is not None: for probands, path in zip(allprobands, outpaths): print "Finding symmetries for probands:", probands PS.write_symmetrical_inds(probands, path) print "Symmetries written to:", path else: print "Probands:", probands print PS.get_symm_inds(probands)