# rostat_numarray.py
# rostat.py
# median = MDIAN1, biwt = XBIWT, mad = XMAD
# converted to python by hand from rostat.f
# by Steven Bamford, 03 June 2004
# confidence interval output added to biwt from
# subroutine CONFIDENCE in rostat.f
# t-tests adapted for biweight from Numerical
# Recipes functions with dof_biwt = 0.7*dof
# fit by minimising MAD adapted from
# Numerical Recipes
# ks-test adapted from Numerical Recipes
import numarray as N
import nr_numarray as nr
from math import sqrt, exp
import pickle
import sys
from os.path import join as pjoin
def median(x, sorted=0):
    """Return the median of x, taken along its first axis.

    x      -- the data.  Unless `sorted` is true it is sorted with
              N.sort(x, axis=0) first (the caller's object is not modified).
    sorted -- pass a true value when x is already in ascending order to
              skip the sort.

    For an even number of points the two central values are averaged;
    for an odd number the central value is returned.  An empty input
    raises IndexError from the element lookup.
    """
    if not sorted:
        x = N.sort(x, axis=0)
    n = len(x)
    # Explicit floor division: a plain `/` produces a float, which cannot
    # be used as an index, under Python 3 or `from __future__ import division`.
    half = n // 2
    if n % 2 == 0:
        return 0.5 * (x[half - 1] + x[half])
    return x[half]
def quartiles(x, sorted=0):
    """Return rough (lower, upper) quartile estimates of x.

    x      -- the data.  Unless `sorted` is true it is sorted with N.sort
              first (the caller's object is not modified).
    sorted -- pass a true value when x is already in ascending order.

    With q = len(x) // 4:
      * even length: lower = 0.75*x[q]   + 0.25*x[q+1]
                     upper = 0.25*x[3*q] + 0.75*x[3*q+1]
      * odd length:  lower = x[q], upper = x[3*q]

    The neighbour index (q+1 or 3*q+1) is clamped to the last element.
    Without the clamp a four-element input indexes one past the end.
    An empty input raises IndexError from the element lookup.
    """
    if not sorted:
        x = N.sort(x)
    n = len(x)
    # Floor division keeps the index an integer on every Python version.
    quarter = n // 4
    low = quarter
    high = 3 * quarter
    if n % 2 == 0:
        last = n - 1
        xlq = 0.75 * x[low] + 0.25 * x[min(low + 1, last)]
        xhq = 0.25 * x[high] + 0.75 * x[min(high + 1, last)]
    else:
        xlq = x[low]
        xhq = x[high]
    return (xlq, xhq)
def biwt(xdata):
    """Return the iterated biweight location and scale of xdata.

    Python port of the XBIWT subroutine (with the confidence intervals of
    subroutine CONFIDENCE) from rostat.f.

    The scale uses the biweight function in the general formula of
    "A-estimators" (UREDA p. 416, formula 4).  The biweight function is

        f(u) = u*(1 - u*u)**2     abs(u) <= 1
        f(u) = 0                  abs(u) >  1

    where

        u = (xdata[i] - T) / (c * MAD)

    T is the current location estimate (initially the sample median M),
    MAD is the median absolute deviation from the median, and c is the
    tuning constant (UREDA p. 417).  Here c = 9.0 is used for the scale
    estimate and c = 6.0 for the location estimate.

    The location is updated as T = T + (sums), with the sums as given on
    UREDA p. 421.

    Starting from the median and the MAD, the estimates are updated
    repeatedly; iteration stops as soon as the relative change of either
    the location or the scale is no larger than 1e-4.  Only the iterated
    values are returned.  If the MAD itself is <= 1e-4 no iteration is
    done and the median and MAD are returned as location and scale.

    xdata -- array of data values (its `.shape` is used to allocate
             work arrays).

    Returns (xlb, xsb, ci):
      xlb -- biweight location
      xsb -- biweight scale
      ci  -- array with three rows: nominal sigma levels
             [1.00, 1.64, 1.96, 2.58], the matching probabilities
             [0.68, 0.90, 0.95, 0.99], and for each probability the value
             t * xsb / sqrt(n), where t comes from the pickled t-table
             looked up with int(0.7*(n-1)) degrees of freedom.

    Requires the file /tmp/t_table.pickle (created by make_t_table.py).
    """
    n = len(xdata)
    # Used both as the MAD floor and as the relative convergence tolerance.
    small = 0.0001

    # starting values: median and median absolute deviation
    xm = median(xdata)
    xmadm = mad(xdata, xm)

    # tuning constants: 6.0 for the location estimator, 9.0 for the scale
    c1 = 6.0
    c2 = 9.0

    xlb = xm
    xsb = xmadm
    if not (xmadm <= small):
        u1 = N.zeros(xdata.shape, N.Float32)
        u2 = N.zeros(xdata.shape, N.Float32)
        xlb_change = xsb_change = 1.0
        while xlb_change > small and xsb_change > small:
            # scaled residuals about the current location; the MAD used
            # for scaling stays fixed at its initial value
            for i in range(n):
                u1[i] = (xdata[i] - xlb)/(c1*xmadm)
                u2[i] = (xdata[i] - xlb)/(c2*xmadm)
            s1 = s2 = s3 = s4 = 0.0
            for i in range(n):
                if abs(u2[i]) < 1.0:
                    s1 = s1 + (xdata[i] - xlb)**2 * (1.0 - u2[i]**2)**4
                    s2 = s2 + (1.0 - u2[i]**2) * (1.0 - 5.0*u2[i]**2)
                if abs(u1[i]) < 1.0:
                    s3 = s3 + (xdata[i] - xlb) * (1.0 - u1[i]**2)**2
                    s4 = s4 + (1.0 - u1[i]**2)**2
            xlb_new = xlb + s3/s4
            # n / sqrt(n-1): `**` binds tighter than `/`
            xsb_new = (float(n)/float(n-1)**0.5) * (s1**0.5 / abs(s2))
            # NOTE(review): relative changes divide by the new estimates,
            # so a location of exactly zero raises ZeroDivisionError.
            xlb_change = abs((xlb_new - xlb) / xlb_new)
            xsb_change = abs((xsb_new - xsb) / xsb_new)
            xlb = xlb_new
            xsb = xsb_new

    # load t-table created by make_t_table.py
    # SECURITY NOTE(review): unpickling executes arbitrary code and /tmp is
    # typically world-writable; consider moving the table to a trusted
    # location.  The path is left unchanged here to stay compatible.
    t_table_file = open('/tmp/t_table.pickle', 'rb')
    try:
        t_table = pickle.load(t_table_file)
    finally:
        # close the handle even when unpickling fails
        t_table_file.close()

    # calculate confidence intervals
    ndof = n-1
    idof = int(0.7*ndof)
    tsigma = [1.00, 1.64, 1.96, 2.58]
    tprob = [0.68, 0.90, 0.95, 0.99]
    xtsc = []
    for tp in tprob:
        # probability handed to the t-table lookup: 0.5 + tp/2
        tpt = 0.5 + tp/2.0
        tvalue = t_table.t(idof, tpt)
        xtsc.append(tvalue*xsb/n**0.5)
    ci = N.array([tsigma, tprob, xtsc])
    return xlb, xsb, ci
def mad(xdata, xmed):
    """Return the median absolute deviation of xdata about xmed.

    Python port of the XMAD subroutine from rostat.f:

        MAD = median{ abs(x[i] - M) }

    The absolute deviations are sorted and their median is taken (the
    mean of the two central values when the number of points is even).
    For more information see UREDA p. 408.

    xdata -- array of data values (must support `xdata - xmed`).
    xmed  -- the centre M to measure deviations from, normally the
             sample median.

    An empty input raises IndexError from the element lookup.
    """
    n = len(xdata)
    deviations = N.sort(N.absolute(xdata - xmed), 0)
    # Floor division / modulo keep the index integral on every Python
    # version; a plain `/` yields a float under true division.
    mid = n // 2
    if n % 2 == 0:
        return 0.5 * (deviations[mid] + deviations[mid - 1])
    return deviations[mid]
def biwt_ttest(n1, ave1, var1, n2, ave2, var2):
    """Pooled-variance t statistic for two samples, biweight-adjusted.

    n1, n2     -- sample sizes
    ave1, ave2 -- sample locations
    var1, var2 -- sample variances

    The degrees of freedom are 0.7*(n1+n2-2), following the file
    header's dof_biwt = 0.7*dof.  The same scaled value divides the
    pooled variance.

    Returns (t, prob), where prob is nr.betai(df/2, 1/2, df/(df+t**2)).
    This is presumably the two-sided significance of t, as in the
    Numerical Recipes t-test this was adapted from; confirm against
    nr.betai.
    """
    dof = 0.7 * (n1+n2-2)
    pooled_var = ((n1-1)*var1+(n2-1)*var2)/dof
    std_err = sqrt(pooled_var*(1.0/n1+1.0/n2))
    t = (ave1-ave2)/std_err
    prob = nr.betai(0.5*dof, 0.5, dof/(dof+t**2))
    return t, prob
def biwt_tutest(n1, ave1, var1, n2, ave2, var2):
    """Unequal-variance t statistic for two samples, biweight-adjusted.

    n1, n2     -- sample sizes
    ave1, ave2 -- sample locations
    var1, var2 -- sample variances

    t = (ave1-ave2) / sqrt(var1/n1 + var2/n2).  The degrees of freedom
    are computed from the two variance terms and then multiplied by 0.7,
    following the file header's dof_biwt = 0.7*dof.

    Returns (t, prob), where prob is nr.betai(df/2, 1/2, df/(df+t**2)).
    This is presumably the two-sided significance of t; confirm against
    nr.betai.
    """
    t = (ave1-ave2)/sqrt(var1/n1+var2/n2)
    numerator = (var1/n1+var2/n2)**2
    denominator = (var1/n1)**2/(n1-1)+(var2/n2)**2/(n2-1)
    dof = 0.7 * (numerator/denominator)
    prob = nr.betai(0.5*dof, 0.5, dof/(dof+t**2))
    return t, prob
def med_func(x, y, sig, b):
    """Evaluate the sign-sum for a trial slope b of the line y = a + b*x.

    x, y -- data arrays
    sig  -- value(s) the residuals are divided by before thresholding
    b    -- trial slope

    The intercept is the median of y - b*x.  Residuals about that line
    are divided by sig, and points whose scaled residual has magnitude
    <= 1e-8 are dropped.

    Returns (total, s, aa):
      total -- sum of sign(residual) * x over the retained points
      s     -- median absolute residual (before dividing by sig) / 0.6745
      aa    -- the intercept
    """
    tiny = 1.0e-8
    intercept = median(y - b*x)
    resid = (y - intercept - b*x)
    scale = median(N.absolute(resid)) / 0.6745
    # resid is a fresh array, so scaling it in place leaves the inputs alone
    resid /= sig
    keep = N.absolute(resid) > tiny
    signs = N.compress(keep, resid)
    signs = signs / N.absolute(signs)
    x_kept = N.compress(keep, x)
    total = N.sum(signs * x_kept)
    return total, scale, intercept
def biwt_func(x, y, sig, b):
    """Evaluate the biweight-weighted sum for a trial slope b.

    NOTE: the original author flagged this function with "Problems?!?".

    x, y -- data arrays
    sig  -- value(s) the residuals are divided by
    b    -- trial slope

    The intercept is the median of y - b*x.  Residuals d about that line
    are divided by sig and weighted with f = d*(1 - d**2/c**2)**2 using
    c = 6.0.

    Returns (total, s, aa):
      total -- sum of x*f over points with abs(d) <= c
      s     -- median absolute residual (before dividing by sig) / 0.6745
      aa    -- the intercept
    """
    intercept = median(y - b*x)
    resid = (y - intercept - b*x)
    scale = median(N.absolute(resid)) / 0.6745
    # resid is a fresh array, so scaling it in place leaves the inputs alone
    resid /= sig
    # biweight
    c = 6.0
    weight = (1-resid**2/c**2)**2
    psi = resid*weight
    inside = N.absolute(resid) <= c
    total = N.sum(N.compress(inside, x*psi))
    # Alternatives the original author left disabled:
    #   lorentzian: f = d/(1+0.5*d**2), summed over all points
    #   MAD: the sign-based sum used by med_func
    return total, scale, intercept
def fit(x, y, sig=1.0, func=med_func):
    """Fit a straight line by finding the slope where func changes sign.

    x, y -- data arrays
    sig  -- passed through to func
    func -- callable func(x, y, sig, b) returning (f, sigma, aa).  The
            slope is sought where f changes sign.  Defaults to med_func.

    nr.fit(x, y) supplies the starting slope and its uncertainty sigb
    (presumably a least-squares fit; confirm against nr.fit).  A second
    slope is placed 3*sigb away in the direction of the sign of f, and
    the pair is extended outward until f has opposite signs at its ends.
    The bracket is then bisected until it is narrower than 0.01*sigb or
    can no longer be halved in floating point.

    Returns (aa, bb, sigma): aa and sigma come from the most recent call
    to func, and bb is the last slope assigned.
    """
    aa, bb, siga, sigb, chi2 = nr.fit(x, y)
    lo = bb
    f_lo, sigma, aa = func(x, y, sig, lo)
    # step 3*sigb in the direction given by the sign of f_lo
    if abs(f_lo) > 1.0e-8:
        hi = bb + 3.0*sigb * f_lo/abs(f_lo)
    else:
        hi = bb + 3.0*sigb
    f_hi, sigma, aa = func(x, y, sig, hi)

    # bracketing: keep stepping outward while both ends share a sign
    while f_lo*f_hi > 0.0:
        bb = 2.0*hi-lo
        lo = hi
        f_lo = f_hi
        hi = bb
        f_hi, sigma, aa = func(x, y, sig, hi)

    # bisection down to one hundredth of the initial slope uncertainty
    tol = 0.01*sigb
    while abs(hi-lo) > tol:
        bb = 0.5 * (lo+hi)
        if bb == lo or bb == hi:
            # the midpoint is indistinguishable from an endpoint
            break
        f_mid, sigma, aa = func(x, y, sig, bb)
        if f_mid*f_lo >= 0.0:
            f_lo = f_mid
            lo = bb
        else:
            f_hi = f_mid
            hi = bb
    return aa, bb, sigma
def fit_intercept(x, y, b):
    """Return (a, s) for a line of fixed slope b through the data.

    Calls med_func(x, y, 1.0, b) and returns its intercept a and its
    scale estimate s.  The sign-sum that med_func also returns is
    discarded.
    """
    result = med_func(x, y, 1.0, b)
    scale = result[1]
    intercept = result[2]
    return intercept, scale
def ks_test(x, y):
    """Two-sample Kolmogorov-Smirnov test (adapted from Numerical Recipes).

    x, y -- the two samples; sorted copies are made with N.sort.

    Returns (d, prob, [binx, fnx_arr], [biny, fny_arr]):
      d       -- the largest absolute difference between the two
                 empirical cumulative distribution functions
      prob    -- probks((n + 0.12 + 0.11/n) * d) with
                 n = sqrt(nx*ny/(nx+ny))
      binx    -- the sorted x values (Float32)
      fnx_arr -- cumulative fraction (j+1)/nx at each sorted x value
      biny, fny_arr -- the same for y

    Raises ValueError if either sample is empty.
    """
    x = N.sort(x)
    y = N.sort(y)
    nx = len(x)
    ny = len(y)
    if nx == 0 or ny == 0:
        raise ValueError('ks_test requires two non-empty samples')

    # Tabulate each empirical CDF for the caller.
    binx = N.zeros(nx, N.Float32)
    fnx_arr = N.zeros(nx, N.Float32)
    for j in range(nx):
        binx[j] = x[j]
        fnx_arr[j] = (j+1)/float(nx)
    biny = N.zeros(ny, N.Float32)
    fny_arr = N.zeros(ny, N.Float32)
    for j in range(ny):
        biny[j] = y[j]
        fny_arr[j] = (j+1)/float(ny)

    # Walk both sorted samples together, stepping past the smaller
    # current value (both on a tie) and comparing the CDFs after each
    # step.  Equal values within a sample are consumed together so the
    # comparison is only made where both CDFs are fully updated.  Once
    # one sample is exhausted its CDF is 1 and the gap can only shrink,
    # so the walk stops there.
    jx = jy = 0
    fnx = fny = 0.0
    d = 0.0
    while jx < nx and jy < ny:
        dx = x[jx]
        dy = y[jy]
        if dx <= dy:
            jx += 1
            while jx < nx and x[jx] == dx:
                jx += 1
            fnx = jx/float(nx)
        if dy <= dx:
            jy += 1
            while jy < ny and y[jy] == dy:
                jy += 1
            fny = jy/float(ny)
        dt = abs(fny-fnx)
        if dt > d:
            d = dt

    # float() keeps the effective sample size from being truncated by
    # integer division
    n = sqrt(nx*ny/float(nx+ny))
    prob = probks((n+0.12+0.11/n)*d)
    return d, prob, [binx, fnx_arr], [biny, fny_arr]
def probks(alam):
    """Kolmogorov-Smirnov probability function (from Numerical Recipes).

    Sums the alternating series

        2 * sum_{j>=1} (-1)**(j-1) * exp(-2 * j**2 * alam**2)

    term by term for j = 1..99.  The running sum is returned as soon as
    a term is at most 0.001 times the previous term's magnitude, or at
    most 1e-8 times the running sum.

    If the series has not converged by then, a warning line is written
    to standard output and 1.0 is returned.
    """
    EPS1 = 0.001
    EPS2 = 1.0e-8
    fac = 2.0
    total = 0.0
    termbf = 0.0  # magnitude of the previous term
    a2 = -2.0*alam*alam
    for j in range(1, 100):
        term = fac*exp(a2*j**2)
        total += term
        if abs(term) <= EPS1*termbf or abs(term) <= EPS2*total:
            return total
        fac = -fac  # alternate the sign of successive terms
        termbf = abs(term)
    # sys.stdout.write works on both Python 2 and Python 3, whereas a
    # print statement is a syntax error on Python 3.
    sys.stdout.write('Warning: KS-test probability function did not converge\n')
    return 1.0
def kuiper_test(x, y):
    """Two-sample Kuiper test.

    x, y -- the two samples; sorted copies are made with N.sort.

    Returns (v, prob, [binx, fnx_arr], [biny, fny_arr]):
      v       -- the sum of the largest positive excursions of the two
                 empirical cumulative distribution functions in each
                 direction, max(Fy - Fx) + max(Fx - Fy), each floored
                 at zero
      prob    -- probkuiper((n + 0.155 + 0.24/n) * v) with
                 n = sqrt(nx*ny/(nx+ny))
      binx    -- the sorted x values (Float32)
      fnx_arr -- cumulative fraction (j+1)/nx at each sorted x value
      biny, fny_arr -- the same for y

    Raises ValueError if either sample is empty.
    """
    x = N.sort(x)
    y = N.sort(y)
    nx = len(x)
    ny = len(y)
    if nx == 0 or ny == 0:
        raise ValueError('kuiper_test requires two non-empty samples')

    # Tabulate each empirical CDF for the caller.
    binx = N.zeros(nx, N.Float32)
    fnx_arr = N.zeros(nx, N.Float32)
    for j in range(nx):
        binx[j] = x[j]
        fnx_arr[j] = (j+1)/float(nx)
    biny = N.zeros(ny, N.Float32)
    fny_arr = N.zeros(ny, N.Float32)
    for j in range(ny):
        biny[j] = y[j]
        fny_arr[j] = (j+1)/float(ny)

    # Walk both sorted samples together, stepping past the smaller
    # current value (both on a tie) and comparing the CDFs after each
    # step.  Equal values within a sample are consumed together so the
    # comparison is only made where both CDFs are fully updated.  Once
    # one sample is exhausted its CDF is 1 and the gap can only shrink,
    # so the walk stops there.
    jx = jy = 0
    fnx = fny = 0.0
    d1 = d2 = 0.0
    while jx < nx and jy < ny:
        dx = x[jx]
        dy = y[jy]
        if dx <= dy:
            jx += 1
            while jx < nx and x[jx] == dx:
                jx += 1
            fnx = jx/float(nx)
        if dy <= dx:
            jy += 1
            while jy < ny and y[jy] == dy:
                jy += 1
            fny = jy/float(ny)
        d1 = max(d1, fny-fnx)
        d2 = max(d2, fnx-fny)
    v = d1 + d2

    # float() keeps the effective sample size from being truncated by
    # integer division
    n = sqrt(nx*ny/float(nx+ny))
    prob = probkuiper((n+0.155+0.24/n)*v)
    return v, prob, [binx, fnx_arr], [biny, fny_arr]
def probkuiper(alam):
    """Probability function for the Kuiper statistic.

    Returns 1.0 outright for alam < 0.4.  Otherwise sums

        2 * sum_{j>=1} (4*j**2*alam**2 - 1) * exp(-2 * j**2 * alam**2)

    term by term for j = 1..99.  The running sum is returned as soon as
    a term's magnitude is at most 0.001 times the previous term's, or at
    most 1e-8 times the running sum.

    If the series has not converged by then, a warning line is written
    to standard output and 1.0 is returned.
    """
    if alam < 0.4:
        return 1.0
    else:
        EPS1 = 0.001
        EPS2 = 1.0e-8
        total = 0.0
        termbf = 0.0  # magnitude of the previous term
        alam2 = alam**2
        for j in range(1, 100):
            j2 = j**2
            term = 2.0 * (4.0*j2*alam2 - 1.0) * exp(-2.0*j2*alam2)
            total += term
            if abs(term) <= EPS1*termbf or abs(term) <= EPS2*total:
                return total
            termbf = abs(term)
        # sys.stdout.write works on both Python 2 and Python 3, whereas
        # a print statement is a syntax error on Python 3.
        sys.stdout.write('Warning: Kuiper test probability function did not converge\n')
        return 1.0