# encoding: utf-8
#
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Author: Kyle Lahnakoski (kyle@lahnakoski.com)
#
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from datetime import timedelta, datetime
from mo_dots import wrap, unwraplist, literal_field
from mo_files import File
from mo_logs import startup, constants, Log
from mo_math import Math, MAX
from mo_threads import Queue, Thread, Signal, THREAD_STOP
from mo_times import Date
from mo_times.timer import Timer
from mo_hg.hg_mozilla_org import HgMozillaOrg
from pyLibrary.env import elasticsearch, http
from pyLibrary.queries import jx

# REPLICATION
#
# Replication has a few benefits:
# 1) The replica can have scripting enabled, allowing a more powerful set of queries
# 2) Physical proximity decreases latency
# 3) The replica can be configured with better hardware
# 4) The replica's exclusivity increases availability (Mozilla's public cluster may have times of high load)

far_back = datetime.utcnow() - timedelta(weeks=52)
BATCH_SIZE = 1000
http.ZIP_REQUEST = False
hg = None
config = None
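
# A MINIMAL SKETCH OF THE SETTINGS FILE READ BY startup.read_settings() IN
# start(); THE PROPERTY NAMES ARE THE ONES REFERENCED BELOW, THE VALUES ARE
# HYPOTHETICAL:
#
#     {
#         "source": {"host": "http://source-es", "index": "repo"},
#         "destination": {"host": "http://dest-es", "index": "repo"},
#         "primary_field": "modified_ts",
#         "batch_size": 1000,
#         "since": null,             # OPTIONAL START DATE; OTHERWISE TAKEN FROM destination
#         "diff": true,              # false SKIPS THE HOLE-SCANNING THREAD
#         "fix": {},                 # FIELD -> EXPRESSION, SEE replicate() BELOW
#         "last_replication_time": "last_replication_time.txt",
#         "hg": null,                # OPTIONAL HgMozillaOrg SETTINGS
#         "debug": {"logs": [{"log_type": "console"}]},
#         "constants": {}
#     }
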
def get_last_updated(es):
    try:
        results_max = es.search({
            "query": {"match_all": {}},
            "from": 0,
            "size": 1,
            "sort": {config.primary_field: "desc"}
        })
        max_ = results_max.hits.hits[0]._source[config.primary_field]
        if isinstance(max_, unicode):
            pass
        elif Math.is_integer(max_):
            max_ = int(max_)
        return max_
    except Exception, e:
        Log.warning("Can not get_last_updated from {{host}}/{{index}}", {
            "host": es.settings.host,
            "index": es.settings.index
        }, e)
        return None


def get_pending(source, since, pending_bugs, please_stop):
    try:
        while not please_stop:
            if since == None:
                Log.note("Get all records")
                result = source.search({
                    # "query": {"match_all": {}},
                    "query": {"filtered": {
                        "filter": {"exists": {"field": config.primary_field}},
                        "query": {"match_all": {}}
                    }},
                    "fields": ["_id", config.primary_field],
                    "from": 0,
                    "size": BATCH_SIZE,
                    "sort": [config.primary_field]
                })
            else:
                Log.note(
                    "Get records with {{primary_field}} >= {{max_time|datetime}}",
                    primary_field=config.primary_field,
                    max_time=since
                )
                result = source.search({
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"range": {config.primary_field: {"gte": since}}},
                    }},
                    "fields": ["_id", config.primary_field],
                    "from": 0,
                    "size": BATCH_SIZE,
                    "sort": [config.primary_field]
                })

            new_max_value = MAX([unwraplist(h.fields[literal_field(config.primary_field)]) for h in result.hits.hits])

            if since == new_max_value:
                # GET ALL WITH THIS TIMESTAMP
                result = source.search({
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"term": {config.primary_field: since}},
                    }},
                    "fields": ["_id", config.primary_field],
                    "from": 0,
                    "size": 100000
                })
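                # since EQUALLED THE PREVIOUS MAX, SO EVERY RECORD AT THIS VALUE
                # WAS JUST PULLED; BUMP since PAST IT SO THE NEXT QUERY MAKES
                # PROGRESS (APPENDING "a" MAKES A STRING THAT SORTS AFTER THE
                # ORIGINAL VALUE)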
                if Math.is_integer(new_max_value):
                    since = int(new_max_value) + 1
                elif Math.is_number(new_max_value):
                    since = float(new_max_value) + 0.5
                else:
                    since = unicode(new_max_value) + "a"
            else:
                since = new_max_value

            ids = result.hits.hits._id
            Log.note("Adding {{num}} to pending queue", num=len(ids))
            pending_bugs.extend(ids)

            if len(result.hits.hits) < BATCH_SIZE:
                break

        Log.note("No more ids")
    except Exception, e:
        please_stop.go()
        Log.error("Problem while copying records", cause=e)


def diff(source, destination, pending, please_stop):
"""
SEARCH FOR HOLES IN DESTINATION
:param source:
:param destination:
:param pending: QUEUE TO FILL WITH MISSING RECORDS
:return:
"""
    if config.diff == False:
        return

    # FIND source MIN/MAX
    results_max = source.search({
        "query": {"match_all": {}},
        "from": 0,
        "size": 1,
        "sort": {config.primary_field: "desc"}
    })
    results_min = source.search({
        "query": {"match_all": {}},
        "from": 0,
        "size": 1,
        "sort": {config.primary_field: "asc"}
    })

    if results_max.hits.total == 0:
        return

    _min = results_min.hits.hits[0]._source[config.primary_field]
    _max = results_max.hits.hits[0]._source[config.primary_field]
    def _copy(min_, max_):
        try:
            if please_stop:
                Log.note("Scanning was aborted")
                return

            source_result = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"range": {config.primary_field: {"gte": min_, "lt": max_}}}
                }},
                "fields": ["_id"],
                "from": 0,
                "size": 200000
            })
            source_ids = set(source_result.hits.hits._id)

            destination_result = destination.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"range": {config.primary_field: {"gte": min_, "lt": max_}}}
                }},
                "fields": ["_id"],
                "from": 0,
                "size": 200000
            })
            destination_ids = set(destination_result.hits.hits._id)

            missing = source_ids - destination_ids
            Log.note(
                "Scan from {{min}} to {{max}}: source={{source}}, dest={{dest}}, diff={{diff}}",
                min=min_,
                max=max_,
                source=len(source_ids),
                dest=len(destination_ids),
                diff=len(missing)
            )
            if missing:
                pending.extend(missing)
        except Exception, e:
            if min_ + 1 == max_:
                Log.warning("Scanning had a problem with field {{value|quote}}", value=min_, cause=e)
            else:
                mid_ = Math.round((min_ + max_) / 2, decimal=0)
                _copy(min_, mid_)
                _copy(mid_, max_)

    num_mismatches = [0]  # TRACK NUMBER OF MISMATCHES DURING REPLICATION
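    # (A ONE-ELEMENT LIST IS USED SO THE NESTED _partition() CAN MUTATE THE
    # COUNT; PYTHON 2 HAS NO nonlocal STATEMENT)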

    def _partition(min_, max_):
        try:
            source_count = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"range": {config.primary_field: {"gte": min_, "lt": max_}}}
                }},
                "size": 0
            })
            if num_mismatches[0] < 10:
                # SOMETIMES THE TWO ARE TOO DIFFERENT TO BE OPTIMISTIC
                dest_count = destination.search({
                    "query": {"filtered": {
                        "query": {"match_all": {}},
                        "filter": {"range": {config.primary_field: {"gte": min_, "lt": max_}}}
                    }},
                    "size": 0
                })
                if source_count.hits.total == dest_count.hits.total:
                    return
                elif source_count.hits.total < 200000:
                    num_mismatches[0] += 1

            if source_count.hits.total < 200000:
                _copy(min_, max_)
            elif Math.is_number(min_) and Math.is_number(max_):
                mid_ = int(round((float(min_) + float(max_)) / 2, 0))
                # WORK BACKWARDS
                _partition(mid_, max_)
                _partition(min_, mid_)
            else:
                Log.error("can not split alphabetical in half")
        except Exception, e:
            Log.error("Scanning had a problem", cause=e)

    try:
        _partition(_min, _max)
    finally:
        Log.note("Done scanning for holes")


def replicate(source, destination, pending_ids, fixes, please_stop):
"""
COPY source RECORDS TO destination
"""

    def fixer(_source):
        # EXPOSE THE RECORD'S FIELDS AS LOCAL VARIABLES SO THE fix EXPRESSIONS
        # CAN REFER TO THEM BY NAME (RELIES ON CPython KEEPING EXTRA KEYS ADDED
        # TO locals())
        for k, v in _source.items():
            locals()[k] = v

        for k, f in fixes.items():
            try:
                _source[k] = eval(f)
            except Exception, e:
                # SUBSTRING TESTS ON e RELY ON mo_logs EXCEPTIONS SUPPORTING "in"
                if "Problem pulling pushlog" in e:
                    pass
                elif "can not find branch" in e:
                    pass
                else:
                    Log.warning("not evaluated {{expression}}", expression=f, cause=e)
        return _source

    for g, docs in jx.groupby(pending_ids, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_docs}} documents", {"num_docs": len(docs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"terms": {"_id": set(docs)}}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })
            destination.extend([{"id": h._id, "value": fixer(h._source)} for h in data.hits.hits])
        if please_stop:
            break

    Log.note("Done replication")


def main():
    global BATCH_SIZE

    current_time = Date.now()
    time_file = File(config.last_replication_time)

    # SYNCH WITH source ES INDEX
    source = elasticsearch.Index(config.source)
    destination = elasticsearch.Cluster(config.destination).get_or_create_index(config.destination)

    # GET LAST UPDATED
    if config.since != None:
        last_updated = Date(config.since).unix
    else:
        last_updated = get_last_updated(destination)

    if config.batch_size:
        BATCH_SIZE = config.batch_size

    Log.note(
        "updating records with {{primary_field}}>={{last_updated}}",
        last_updated=last_updated,
        primary_field=config.primary_field
    )
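
    # THREE THREADS FORM A SMALL PIPELINE: get_pending AND diff BOTH PUSH _ids
    # ONTO THE pending QUEUE, AND replicate DRAINS IT IN BATCHES OF BATCH_SIZE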
    please_stop = Signal()
    done = Signal()

    pending = Queue("pending ids", max=BATCH_SIZE * 3, silent=False)

    pending_thread = Thread.run(
        "get pending",
        get_pending,
        source=source,
        since=last_updated,
        pending_bugs=pending,
        please_stop=please_stop
    )
    diff_thread = Thread.run(
        "diff",
        diff,
        source,
        destination,
        pending,
        please_stop=please_stop
    )
    replication_thread = Thread.run(
        "replication",
        replicate,
        source,
        destination,
        pending,
        config.fix,
        please_stop=please_stop
    )

    pending_thread.join()
    diff_thread.join()
    pending.add(THREAD_STOP)

    try:
        replication_thread.join()
    except Exception, e:
        Log.warning("Replication thread failed", cause=e)

    done.go()
    please_stop.go()

    Log.note("done all")

    # RECORD LAST UPDATED, IF WE DID NOT CANCEL OUT
    time_file.write(unicode(current_time.milli))


def start():
    global hg
    global config

    _ = wrap

    try:
        config = startup.read_settings()
        with startup.SingleInstance(config.args.filename):
            constants.set(config.constants)
            Log.start(config.debug)
            if config.hg:
                hg = HgMozillaOrg(config.hg)
            main()
    except Exception, e:
        Log.warning("Problems exist", e)
    finally:
        Log.stop()


if __name__ == "__main__":
    start()