-
Notifications
You must be signed in to change notification settings - Fork 0
/
sim-ibd-pedigree.py
184 lines (158 loc) · 7.66 KB
/
sim-ibd-pedigree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/python
# Help text for the command line: this string is handed to OptionParser as its
# `description`.  It also lists the names the input file given with -i is
# expected to define (that file is read and exec'd further down).
# NOTE(review): optparse's default formatter presumably re-wraps this text, so
# the line breaks below may not survive in --help output -- confirm.
description = '''Simulate IBD segments in a diploid population.
infile should contain code defining
sampsizes -- a dict whose values are the sample sizes
ancnefn(t) -- a function returning a dict whose keys are population names and whose values are effective population sizes in generation t
migprobs(t) -- a function returning a dict whose keys are pairs (tuples) of population names (x,y)
and whose values are backwards migration rates from x to y in generation t
Here pop is a dict whose keys give the names of the populations (the same as the keys of sampsizes)
and if ancnefn or migprobs can also be dicts rather than functions (i.e. constant).
Some of these can be passed in on the command line.
Peak memory usage is:
pop: [ number of breakpoints in sample ] -> float
and [ number of breakpoints in sample ] -> int
recombdict: [ number of breakpoints in ancestors ] -> float
parentdict: ploidy * [ number of ancestors ] -> int
and [ number of breakpoints in sample ] = x =
ploidy * (number of samples) * ( (number of generations)*(total genome length) + (number of chromosomes) )
and [ number of ancestors ] ~ [ effective pop size ] * ( 1- exp(-[number of samples]/[effective pop size]) )
and [ number of breakpoints in ancestors ] = ploidy * ( total genome length + number of chromosome ) * [ number of ancestors ]
Example:
python sim-ibd-pedigree.py -i sim-demographics-2.py -t 10 -b test.ibd.gz -l test.log
'''
from optparse import OptionParser
import coalpedigree as coal
import re
import time
import subprocess, os, sys
import signal
# import pdb
# Command-line interface.  No option declares a type, so every value arrives
# as a string (or None when a flag without a default is omitted); numeric
# conversion happens later, where each value is consumed.
parser = OptionParser(description=description)
# (disabled) "-c"/"--coalfile": file to write final coalescent info to
parser.add_option(
    "-b", "--ibdfile",
    dest="ibdfile",
    default="-",
    help="name of file to write final ibd blocks to (or '-' for stdout)",
)
parser.add_option(
    "-l", "--logfile",
    dest="logfile",
    default="-",
    help="name of log file (or '-' for stdout)",
)
parser.add_option(
    "-i", "--infile",
    dest="infile",
    help="name of input file to get parameters from (or '-' for stdin)",
)
parser.add_option(
    "-t", "--ngens",
    dest="ngens",
    default="10",
    help="total number of generations to simulate",
)
parser.add_option(
    "-n", "--nesize",
    dest="nesize",
    help="effective population size",
)
parser.add_option(
    "-m", "--migprob",
    dest="migprob",
    help="migration probability",
)
parser.add_option(
    "-s", "--samplesizes",
    dest="sampsizes",
    help="sample sizes",
)
parser.add_option(
    "-e", "--minlen",
    dest="minlen",
    default=None,
    help="minimum length of IBD block to record IN MORGANS (default value 0.005M = 0.5cM)",
)
parser.add_option(
    "-g", "--gaplen",
    dest="gaplen",
    default=None,
    help="gap length, IN MORGANS: blocks closer together than this will be recorded even if shorter than minlen (default value 0.5M = 50cM)",
)
options, args = parser.parse_args()
# read in parameters etc
#
# Resolution order for each parameter: command-line option first, then
# whatever the input file defined, then (for minlen/gaplen only) a default.
# After this section the module-level names ibdfile, logfile, ngens, ancnefn,
# migprobs, sampsizes, minlen, gaplen and inparams are all defined.


def _constant_over_time(value):
    """Return a function of the generation t that always returns `value`.

    Used to turn a plain dict given for ancnefn/migprobs into the callable
    form the rest of the script expects.  A helper is needed because
    `f = lambda t: f` would look up the *rebound* name and return itself.
    """
    def constant(t):
        return value
    return constant


# Text of the input file; stays empty when no -i file is given so that it can
# still be written to the log.
inparams = ""
if options.infile is not None:
    infile = coal.fileopt(options.infile, "r")
    try:
        inparams = infile.read()
    finally:
        # "-" means stdin (see the -i help text); leave that stream open.
        if options.infile != "-":
            infile.close()
    # SECURITY NOTE: the input file is executed as Python code in this
    # module's namespace -- only run parameter files you trust.
    exec(inparams)

# command line options supersede statements in infile
ibdfile = coal.fileopt(options.ibdfile, "w")
# coalfile = coal.fileopt(options.coalfile, "w")
logfile = coal.fileopt(options.logfile, "w")
ngens = int(options.ngens)

if options.nesize is not None:
    # The number is converted now (so bad input fails early); `pop` is looked
    # up only when the function is called, because it is created later.
    ancnefn = lambda t, _ne=int(options.nesize): {}.fromkeys(pop.keys(), _ne)
else:
    try:
        if isinstance(ancnefn, dict):
            ancnefn = _constant_over_time(ancnefn)
    except NameError:
        print("Effective pop sizes (ancne) needs to be specified on the command line (-n) or in the input file (-i).")
        raise

if options.migprob is not None:
    # Same rate for every ordered pair of populations, including (x, x).
    migprobs = lambda t, _mig=float(options.migprob): {}.fromkeys(
        [(x, y) for x in pop.keys() for y in pop.keys()], _mig)
else:
    try:
        if isinstance(migprobs, dict):
            migprobs = _constant_over_time(migprobs)
    except NameError:
        print("Migration rates (migprobs) need to be specified on the command line (-m) or in the input file (-i).")
        raise

if options.sampsizes is not None:
    # Comma- and/or space-separated integers; populations are then named
    # 0, 1, 2, ...  The separator must match at least one character: a
    # pattern that can match the empty string splits between every character
    # on Python >= 3.7.
    sampsizes = dict(enumerate(
        int(token)
        for token in re.split(r"[, ]+", options.sampsizes)
        if token
    ))
else:
    try:
        if not isinstance(sampsizes, dict):
            raise TypeError("sampsizes should be a dict!")
    except NameError:
        raise TypeError("Sampsizes must be set in infile or on command line.")

if options.minlen is not None:
    minlen = float(options.minlen)
try:
    if not isinstance(minlen, float):
        raise TypeError("minlen is not a float: " + str(minlen))
except NameError:
    # default value
    minlen = 0.005

if options.gaplen is not None:
    gaplen = float(options.gaplen)
try:
    if not isinstance(gaplen, float):
        raise TypeError("gaplen is not a float: " + str(gaplen))
except NameError:
    # default value
    gaplen = 0.5
# catch ctrl-c gracefully
# One entry is appended per SIGINT received; the simulation loop treats a
# non-empty list as a request to stop after the current generation.
_exitnow = []


def catch_int(signal, frame):
    """SIGINT handler: note the interrupt, log it, and abort on the second one.

    The first interrupt only records itself in `_exitnow` and writes a notice
    to the log.  Any later interrupt writes a different notice and raises
    SystemExit.  The two arguments are the ones the signal machinery passes
    to a handler; neither is used.
    """
    _exitnow.append(True)
    repeated = len(_exitnow) > 1
    if repeated:
        notice = "Caught SIGINT twice, terminating immediately.\n"
    else:
        notice = "Caught SIGINT, exiting after this generation. SIGINT again to terminate.\n"
    logfile.write(notice)
    logfile.flush()
    if repeated:
        raise SystemExit


signal.signal(signal.SIGINT, catch_int)
# initialize
pop = coal.initpop(sampsizes)

# sanity checks
# Every population named in a migration pair must have an effective size and
# vice versa; both are evaluated at generation 1.  Set comprehensions are used
# instead of reduce(): reduce is not a builtin on Python 3, and on an empty
# migprobs dict it fails with an unrelated "empty sequence" TypeError.
mignames = {name for pair in migprobs(t=1) for name in pair}
ancnenames = set(ancnefn(t=1))
if mignames != ancnenames:
    raise TypeError("Inconsistent population names -- using command-line samplesizes?")
# record "version number"
# The script's directory is made absolute *before* ".git" is appended:
# dirname(sys.argv[0]) is "" when the script is started by bare file name, and
# "" + "/.git" would point at the filesystem root.
gitdir = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".git")
try:
    # stderr is captured too, so git's complaints do not leak onto the
    # terminal and giterr actually holds something on failure.
    gitproc = subprocess.Popen(
        ["git", "--git-dir=" + gitdir, "rev-parse", "HEAD"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    githash, giterr = gitproc.communicate()
    if gitproc.returncode != 0:
        githash = ""
except OSError:
    # git is not installed or could not be started; the version stamp is
    # optional, so carry on without it.
    githash, giterr = "", None
if not isinstance(githash, str):
    # On Python 3 the pipe yields bytes.
    githash = githash.decode("ascii", "replace")
# Drop git's trailing newline so the header stays on one line.
githash = githash.strip() or "(none available)"

# %b is the portable spelling of the abbreviated month name (%h is a
# platform extension that produces the same text where it exists).
logfile.write("sim-ibd-pedigree.py -- githash " + githash + " " + time.strftime("%d %b %Y %H:%M:%S", time.localtime()) + "\n")
logfile.write("\n")
logfile.write("options "+str(options)+"\n")
logfile.write("\n")
# (coalescent output is currently disabled, so no "coal output" line)
logfile.write("ibd output: " + str(options.ibdfile)+"\n")
logfile.write("input: " + str(options.infile)+"\n")
logfile.write("--------------------------------\n")
# inparams only exists when an input file was read.
if options.infile is not None:
    logfile.write(inparams)
logfile.write("--------------------------------\n")
logfile.write("ngens: " + str(ngens)+"\n")
logfile.write("sampsizes: " + str(sampsizes)+"\n")
logfile.write("minlen: " + str(minlen)+"\n")
logfile.write("gaplen: " + str(gaplen)+"\n")
logfile.write("Ne at 1: " + str(ancnefn(t=1))+"\n")
logfile.write("migprob at 1: " + str(migprobs(t=1))+"\n")
logfile.write("chromosome ending positions: " + str(list(coal.chrpos)+[coal.chrlen]) + "\n")
logfile.write("\n")
logfile.write("Beginning ------------\n")
# here is where the action happens
#
# Runs up to ngens generations backwards in time, then writes the IBD blocks.
# The nested try/finally blocks make sure both output files are closed even
# if the simulation or the IBD writer raises part-way through.
try:
    try:
        # range() works on both Python 2 and 3 (xrange exists only on 2).
        for t in range(ngens):
            logfile.write(" t="+str(t)+"\n")
            if t % 10 == 0:
                # NOTE(review): this label and the one after the loop give
                # the two census fields in opposite order; check coal.census
                # to see which is right.
                logfile.write(" census (num segments, num indivs): " + str(coal.census(pop, sampsizes=sampsizes)) + "\n")
            # Flush every generation so progress can be followed in the log.
            logfile.flush()
            coal.parents(pop, ancne=ancnefn(t), migprobs=migprobs(t), t=t)
            if _exitnow:
                # there's been a ctrl-c; stop now.
                break
        logfile.write(" census (num indivs, num segments): " + str(coal.census(pop, sampsizes=sampsizes)) + "\n")
        # %b is the portable spelling of the abbreviated month name.
        logfile.write("Done with simulation at " + time.strftime("%d %b %Y %H:%M:%S", time.localtime()) + "; now writing out IBD info.\n")
        coal.writeibd(pop, minlen=minlen, gaplen=gaplen, outfile=ibdfile)
        # (coalescent output via writecoal/coalfile is currently disabled)
        logfile.write("\n")
        logfile.write("Closing ibd file...\n")
    finally:
        ibdfile.close()
    logfile.write("All done at " + time.strftime("%d %b %Y %H:%M:%S", time.localtime()) + "\n")
finally:
    logfile.close()