import copy
import itertools
import numpy as np
import gvar as gv
import manipulate_dataset as md
from multiprocessing import Pool
#pi0:~simone/python/minFit/Zvqq/new-Zvqq.py
# # intersection of config id
# self.commonConfigId = set(configId[0]).intersection(*configId[1:]) # intersection set of configId
# self.all = list()
# for cl in configId:
# al = list()
# for id, j in zip(cl,range(len(cl))):
# if id in self.commonConfigId:
# al.append(j)
# pass
# pass
# self.all.append(al)
# pass
# print 'count configs in db vs the intersection'
# for j in range(len(configId)):
# print 'npt:', j, 'db:', len(configId[j]), 'all:', len(self.all[j])
# pass
# self.nconf = len(self.all[0])
# return
def get_intersect(dset,pfix1,pfix2):
    ## -- compute the configuration tags common to the datasets with prefixes pfix1 and pfix2
    ##    returns the common tags (key suffixes) as a set
klst1 = []
klst2 = []
for key in dset:
skey = key.split('_')
if pfix1 == skey[0]:
klst1.append('_'.join(skey[1:]))
if pfix2 == skey[0]:
klst2.append('_'.join(skey[1:]))
return set(klst1).intersection(klst2)
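
## -- a small sketch of the key convention get_intersect assumes (hypothetical tags):
##    keys look like '<prefix>_<config>', e.g. 'pion_200', 'pion_210', 'kaon_200';
##    get_intersect(dset,'pion','kaon') then returns set(['200']), the configurations
##    on which both correlators were measured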
def compute_correlation_pair(dset,pfix1,pfix2):
    ## -- compute correlations between two correlators, using only their common configurations
    ##    takes a pair of prefixes; returns the averaged dataset with correlations attached
isect = get_intersect(dset,pfix1,pfix2)
tdat0 = gv.dataset.Dataset()
print "starting correlations ",pfix1,pfix2
for key in dset:
skey = key.split('_')
if skey[0] == pfix1:
tdat0[key] = md.average_tag_fn(dset[key]) ## keep suffix
#print key
#print tdat0[key]
if pfix1 != pfix2:
for key in dset:
skey = key.split('_')
if skey[0] == pfix2:
tdat0[key] = md.average_tag_fn(dset[key])
tdat1 = gv.dataset.Dataset(keys=[pfix1,pfix2])
if pfix1 == pfix2:
for key in isect:
tdat1.append(pfix1,tdat0[pfix1+'_'+key][0]) # arrays of len 1
else:
for key in isect:
tdat1.append(pfix1,tdat0[pfix1+'_'+key][0])
tdat1.append(pfix2,tdat0[pfix2+'_'+key][0])
#print tdat1.keys()
#print len(tdat1[pfix1])
#raise ValueError("test")
#print "starting avg_data"
#testout = gv.dataset.avg_data(tdat1)
#print "ending avg_data"
#return testout
return gv.dataset.avg_data(tdat1)
#tdat1[pfix1] = []
#tdat1[pfix2] = []
#for key in isect:
# tdat1[pfix1].append(tdat0[pfix1+'_'+key])
#if pfix1 != pfix2:
# for key in isect:
# tdat1[pfix2].append(tdat0[pfix2+'_'+key])
#return gv.dataset.avg_data(tdat1)
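
## -- a rough sketch of what compute_correlation_pair returns (hypothetical prefixes):
##    out = compute_correlation_pair(dset,'pion','kaon')
##    out['pion'] and out['kaon'] are arrays of gvar's averaged over the common configs,
##    and gv.evalcorr(out)['pion','kaon'] gives their cross-correlation matrix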
def divide_and_conquer_correlations(dset):
    ## -- split the correlation matrix into pairwise sub-datasets and average each small set
    ##    take correlations of the small sets and stitch them back together with gv.correlate
    ##    this version processes the pairs serially; divide_and_conquer_parallel below does them in parallel
    ##    assumes all keys in dset take the form '<correlator key>_<series configuration>'
pflst = []
## -- create a list of prefixes
for key in dset:
skey = key.split('_')
        if skey[0] not in pflst:
pflst.append(skey[0])
rdat = gv.BufferDict()
call = {}
## -- compute diagonals first to construct full dataset
for key1 in pflst:
        ## -- handled separately because there may be more configurations in the full sample
tdat = compute_correlation_pair(dset,key1,key1)
rdat[key1] = tdat[key1]
call[key1,key1] = gv.evalcorr(tdat)[key1,key1]
for i,key1 in enumerate(pflst):
for j,key2 in enumerate(pflst):
if i <= j:
continue ## -- degenerate with i > j, i == j
tdat = compute_correlation_pair(dset,key1,key2)
cdat = gv.evalcorr(tdat)
call[key1,key2] = cdat[key1,key2]
call[key2,key1] = cdat[key2,key1]
## -- add correlations and return
rdat = gv.correlate(rdat,call)
return rdat
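
## -- usage sketch (assumes dset was built elsewhere with '<prefix>_<config>' keys;
## -- the 'pion'/'kaon' names are only illustrative):
##    avg = divide_and_conquer_correlations(dset)
##    avg['pion']                       # averaged correlator with errors
##    gv.evalcorr(avg)['pion','kaon']   # stitched cross-correlation matrix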
## -- Pool.map only passes a single argument to each worker, so the inputs are packed into tuples
def compute_diagonal((dset,key)):
print "diagonal key ",key
tdat = compute_correlation_pair(dset,key,key)
return (key,gv.mean(tdat[key]),gv.sdev(tdat[key]),gv.evalcorr(tdat)[key,key])
def compute_offdiagonal((dset,key1,key2)):
print "off-diagonal key ",(key1,key2)
tdat = compute_correlation_pair(dset,key1,key2)
return (key1,key2,gv.evalcorr(tdat)[key1,key2])
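
## -- sketch of the packing these workers expect (hypothetical prefixes):
##    pool.map(compute_offdiagonal, [(dset,'pion','kaon'), (dset,'pion','rho')])
##    each list element arrives as the worker's single tuple argument and is unpacked
##    by the (Python 2) tuple-parameter syntax above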
maxProcesses=6
def divide_and_conquer_parallel(dset):
    ## -- split the correlation matrix into pairwise sub-datasets and average each small set
    ##    take correlations of the small sets and stitch them back together with gv.correlate
    ##    computations of the small sets are done in parallel with multiprocessing.Pool
    ##    assumes all keys in dset take the form '<correlator key>_<series configuration>'
pflst = []
pool = Pool(processes=maxProcesses)
## -- create a list of prefixes
for key in dset:
skey = key.split('_')
        if skey[0] not in pflst:
pflst.append(skey[0])
#if len(pflst) > 1:
# break
rdat = gv.BufferDict()
call = {}
    ## -- diagonals handled separately because there may be more configurations in the full sample
print "computing diagonal"
mapout = pool.map(compute_diagonal,zip([dset for x in pflst],pflst))
print "done computing diagonal"
for val in mapout:
rdat[val[0]] = gv.gvar(val[1],val[2])
call[val[0],val[0]] = val[3]
idx = np.triu_indices(len(pflst),1)
print "computing off-diagonal"
mapout = pool.map(compute_offdiagonal,
zip([dset for x in idx[0]],[pflst[t] for t in idx[0]],[pflst[t] for t in idx[1]]))
print "done computing off-diagonal"
pool.close()
pool.join()
for val in mapout:
call[val[0],val[1]] = val[2]
call[val[1],val[0]] = np.transpose(val[2])
## -- add correlations and return
rdat = gv.correlate(rdat,call)
return rdat
#return gv.BufferDict()
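
## -- a minimal end-to-end sketch with fabricated data; the key names and shapes here are
## -- only illustrative, and it assumes md.average_tag_fn averages the per-config samples
## -- the way compute_correlation_pair expects (see '# arrays of len 1' above)
if __name__ == '__main__':
    demo = gv.dataset.Dataset()
    for cfg in range(200,300,10):      ## fake configuration series numbers
        for pfx in ['pion','kaon']:    ## fake correlator prefixes
            ## one random length-4 "correlator" sample per (prefix,config) key
            demo.append(pfx+'_'+str(cfg),np.random.rand(4))
    avg = divide_and_conquer_correlations(demo)
    ## the parallel driver is called the same way: divide_and_conquer_parallel(demo)
    print "averaged correlator:",avg['pion']
    print "cross-correlation:",gv.evalcorr(avg)['pion','kaon']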