forked from j2payton/cis4307-2020-raft-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
raftnode.py
649 lines (516 loc) · 31.8 KB
/
raftnode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
import logging
import os
import random
import threading
import time
import traceback
import math
import rpyc
from rpyc.utils import helpers
from rpyc.core import AsyncResultTimeout
import socket
import sys
class NodeRef:
    """Connection details for one peer node of the cluster.

    Attributes:
        name: label of the node (used in log messages).
        host: host address the node can be reached at.
        port: port the node's RPC server listens on.
    """

    def __init__(self, name, hostAddr, port):
        self.name = name
        self.host = hostAddr
        self.port = port

    def __repr__(self):
        # readable form for logs and debuggers instead of the default object address
        return "NodeRef(name=%r, host=%r, port=%r)" % (self.name, self.host, self.port)
'''
A RAFT RPC server class.
Please keep the signature of the is_leader() method unchanged (though
implement the body of that function correctly). You will need to add
other methods to implement ONLY the leader election part of the RAFT
protocol.
'''
class RaftNode(rpyc.Service):
# RPyC service implementing one node of the cluster; the constants below are shared by all instances.
# used to calculate the seconds to wait before calling an election,
# based on typical round-trip communication delay for contacting 1 active node
ELECTION_TIMEOUT_BASELINE = 0.3
# how long (in seconds) to wait when making a connection to another node before giving up and assuming
# the other node is down
# also used to limit the delay if an RPC somehow got stuck partway through, but that's highly unlikely
# since RPC execution time (on the other node) is 2-5 ms
CONNECTION_TIMEOUT = 0.35
# folder holding one "node<index>.txt" state backup file per node
NODE_STATE_FOLDER = "node_states"
# folder holding one "raftNode<index>.log" log file per node
NODE_LOGS_FOLDER = "node_logs"
# separates key from value on each line of a state backup file
BACKUP_SEPARATOR = ":"
# keys of the three lines written to a state backup file
TERM_BACKUP_KEY = "term"
VOTE_BACKUP_KEY = "vote"
CURR_LEADER_BACKUP_KEY = "currLeader"
"""
Initialize the class using the config file provided and also initialize
any data structures you may need.
"""
def __init__(self, configFilePath, nodeIdentityIndex):
    '''sets up this node: logging, any state restored from a previous run, the list of peer nodes
    and the election timer
    :param configFilePath str: path of the cluster config file; its first line (the node count) is skipped and
        every following line is read as "name:host:port"
    :param nodeIdentityIndex int: zero-based position of this node among the node lines of the config file
    '''
    self.identityIndex = nodeIdentityIndex
    self.isCandidate = False
    self.currTerm = 0
    self.voteTarget = None  # who the node is voting for in the current term
    self.currLeader = None  # who has been elected leader in the current term
    # reentrant because methods holding stateLock call other methods that use these locks
    self.stateFileLock = threading.RLock()
    self.stateLock = threading.RLock()
    self.nodeStateFilePath = os.path.join(RaftNode.NODE_STATE_FOLDER, "node" + str(self.identityIndex) + ".txt")

    # set up logging: everything goes to the node's log file, warnings and above also go to the console
    nodeName = "raftNode" + str(nodeIdentityIndex)
    self.nodeLogger = logging.getLogger(nodeName)
    self.nodeLogger.setLevel(logging.DEBUG)
    # exist_ok avoids the race between checking for the folder and creating it when several nodes start at once
    os.makedirs(RaftNode.NODE_LOGS_FOLDER, exist_ok=True)
    logFilePath = os.path.join(RaftNode.NODE_LOGS_FOLDER, nodeName + ".log")
    logFileHandler = logging.FileHandler(logFilePath)
    logFileHandler.setLevel(logging.DEBUG)
    consoleHandler = logging.StreamHandler()
    consoleHandler.setLevel(logging.WARN)
    formatter = logging.Formatter('%(asctime)s - %(threadName)s - %(levelname)s - %(message)s')
    logFileHandler.setFormatter(formatter)
    consoleHandler.setFormatter(formatter)
    self.nodeLogger.addHandler(logFileHandler)
    self.nodeLogger.addHandler(consoleHandler)

    # restore term/vote/leader from the backup written by a previous run, if there is one
    if os.path.exists(self.nodeStateFilePath):
        with open(self.nodeStateFilePath, mode="r") as nodeBackup:
            nodeStateBackup = self._load_node_backup(nodeBackup)
        storedTermStr = nodeStateBackup.get(RaftNode.TERM_BACKUP_KEY)
        if storedTermStr is not None:
            self.currTerm = int(storedTermStr)
        storedVoteStr = nodeStateBackup.get(RaftNode.VOTE_BACKUP_KEY)
        if storedVoteStr is not None:
            storedVoteVal = int(storedVoteStr)
            if storedVoteVal >= 0:  # negative values stand for None
                self.voteTarget = storedVoteVal
        storedCurrLeaderStr = nodeStateBackup.get(RaftNode.CURR_LEADER_BACKUP_KEY)
        if storedCurrLeaderStr is not None:
            storedCurrLeaderVal = int(storedCurrLeaderStr)
            if storedCurrLeaderVal >= 0:  # negative values stand for None
                self.currLeader = storedCurrLeaderVal
        # explicit None checks: "x or -1" would misreport node index 0 as -1
        self.nodeLogger.info("loading backup of prior node state from disk:\n"
                             "term %d; voteTarget %d (-1 standing for None); "
                             "current leader %d (-1 standing for None)", self.currTerm,
                             self.voteTarget if self.voteTarget is not None else -1,
                             self.currLeader if self.currLeader is not None else -1)

    # every node line except this node's own describes a peer
    self.otherNodes = []
    with open(configFilePath) as nodesConfigFile:
        nodesConfigFile.readline()  # ignore first line with node count
        for nodeInd, nodeLine in enumerate(nodesConfigFile):
            if nodeInd == nodeIdentityIndex or not nodeLine.strip():
                continue  # skip this node's own line and blank lines
            otherNodeTerms = nodeLine.split(":")
            otherNode = NodeRef(otherNodeTerms[0].strip(), otherNodeTerms[1].strip(),
                                int(otherNodeTerms[2].strip()))
            self.otherNodes.append(otherNode)

    numOtherNodes = len(self.otherNodes)
    numNodes = 1 + numOtherNodes
    # votes needed from OTHER nodes: a majority (numNodes // 2 + 1) minus the vote a candidate gives itself
    numVotesNeeded = numNodes // 2
    # based on worst-case where only a bare majority of nodes are still alive & one or more of those live nodes
    # is after all of the dead ones in the list
    minimumElectionTimeout = (numOtherNodes - numVotesNeeded) * RaftNode.CONNECTION_TIMEOUT + \
        RaftNode.ELECTION_TIMEOUT_BASELINE * numVotesNeeded
    # todo try 0.5-1.5 rather than 1-2 or 0.75-1.75
    # randomised between 1x and 2x the minimum
    self.electionTimeout = (1 + random.random()) * minimumElectionTimeout
    # set before the timer starts so everything the timer callbacks read already exists
    self.heartbeatInterval = 0.5 * minimumElectionTimeout
    self._restart_timer()
    self.nodeLogger.critical("I am node %d (election timeout %f) and I just finished being constructed, with %d fellow nodes",
                             self.identityIndex, self.electionTimeout, len(self.otherNodes))
    for otherNodeDesc in self.otherNodes:
        self.nodeLogger.debug("other node %s is at host %s and port %d", otherNodeDesc.name, otherNodeDesc.host,
                              otherNodeDesc.port)
def _save_node_state(self):
    '''writes the node's term, vote and current leader to its state backup file and forces them to disk
    Each value is written as one "key:value" line; -1 stands for "no vote" / "no leader".
    '''
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(RaftNode.NODE_STATE_FOLDER, exist_ok=True)
    # explicit None checks: "x or -1" would turn node index 0 into -1, so a vote for / leadership of
    # node 0 would be lost when the backup is read again
    voteTargetIndex = self.voteTarget if self.voteTarget is not None else -1
    currLeaderIndex = self.currLeader if self.currLeader is not None else -1
    stateLines = [
        RaftNode.TERM_BACKUP_KEY + RaftNode.BACKUP_SEPARATOR + str(self.currTerm) + "\n",
        RaftNode.VOTE_BACKUP_KEY + RaftNode.BACKUP_SEPARATOR + str(voteTargetIndex) + "\n",
        RaftNode.CURR_LEADER_BACKUP_KEY + RaftNode.BACKUP_SEPARATOR + str(currLeaderIndex) + "\n",
    ]
    # "with" releases the lock even if opening or writing the file raises
    with self.stateFileLock:
        with open(self.nodeStateFilePath, mode="w") as nodeStateStorageFile:
            nodeStateStorageFile.writelines(stateLines)
            # push the data through Python's buffer and the OS cache before returning
            nodeStateStorageFile.flush()
            os.fsync(nodeStateStorageFile.fileno())
def _load_node_backup(self, backupFile):
    '''reads key-value pairs from the file containing a backup of the node's state
    :param backupFile: an iterable of text lines (e.g. an open text file), each of the form "key:value"
    :return dict: maps each key to its value, both as stripped strings; lines which don't split into
        exactly one key and one value are logged as errors and skipped
    '''
    backupDict = {}
    for backupLine in backupFile:
        # lines read from a file keep their newline, so blank lines must be detected after stripping
        if not backupLine.strip():
            continue
        lineTokens = backupLine.split(RaftNode.BACKUP_SEPARATOR)
        if len(lineTokens) == 2:
            backupDict[lineTokens[0].strip()] = lineTokens[1].strip()
        else:
            self.nodeLogger.error("malformed line in node backup file: %s", backupLine)
    return backupDict
def _restart_timer(self):
    '''resets the election timer: records now as the time of last contact and schedules
    check_for_election_timeout() to run after electionTimeout seconds'''
    self.lastContactTimestamp = time.time()
    # cancel the timer being replaced so its thread doesn't linger until it fires;
    # getattr because no timer exists yet on the first call
    supersededTimer = getattr(self, "_electionTimer", None)
    if supersededTimer is not None:
        supersededTimer.cancel()  # no effect if that timer has already fired
    self._electionTimer = threading.Timer(self.electionTimeout, self.check_for_election_timeout)
    self._electionTimer.start()
def exposed_is_leader(self):
    '''tells the caller whether this node currently regards itself as the leader
    Meant to be called as RPC
    :return boolean: True when the recorded current leader is this node's own index
    '''
    recordedLeader = self.currLeader
    return recordedLeader == self.identityIndex
def exposed_append_entries(self, leaderTerm, leaderIndex):
    '''tries to reset this node's election timer on behalf of an RPC-caller node that thinks it's the current leader
    Meant to be called as RPC
    :param leaderTerm int: the term which the caller thinks is most recent
    :param leaderIndex int: the index of the node which thinks it's the leader & is sending a heartbeat to this node
    :return tuple(int, boolean): what term this node thinks is most recent and whether this node is recognizing
        the caller of the RPC as the leader
    '''
    willAppendEntries = False
    appendEntriesStartTime = time.time()
    self.nodeLogger.debug("about to acquire LOCK to execute append_entries RPC for leader node %d "
                          "which was in term %d", leaderIndex, leaderTerm)
    # "with" guarantees the lock is released even if saving the state raises
    with self.stateLock:
        self.nodeLogger.debug("successfully acquired LOCK to execute append_entries RPC for leader node %d "
                              "which was in term %d", leaderIndex, leaderTerm)
        termAtStartOfAppendEntries = self.currTerm
        if leaderTerm < self.currTerm:
            # stale leader: refuse, and let it learn the newer term from the return value
            self.nodeLogger.info(
                "while in term %d, received append_entries() from stale leader %d which thought it was in term %d",
                self.currTerm, leaderIndex, leaderTerm)
        else:
            self.nodeLogger.debug(
                "while in term %d, executing append_entries on behalf of node %d, the leader in term %d",
                self.currTerm, leaderIndex, leaderTerm)
            self._restart_timer()
            if leaderTerm > self.currTerm:
                # move to the newer term, dropping everything tied to the old one
                if self.voteTarget is not None:
                    self.nodeLogger.warning("was in election for term %d, voting for candidate node %d, "
                                            "when received request to append entries in later term %d", self.currTerm,
                                            self.voteTarget, leaderTerm)
                    self.voteTarget = None
                # explicit None check: "x or -1" would log leader index 0 as -1
                self.nodeLogger.critical("was in term %d with candidate status %s and current leader index %d when "
                                         "received heartbeat from leader node %d in higher term %d", self.currTerm,
                                         self.isCandidate,
                                         self.currLeader if self.currLeader is not None else -1,
                                         leaderIndex, leaderTerm)
                self.isCandidate = False
                self.currLeader = None
                self.currTerm = leaderTerm
            if self.currLeader != leaderIndex:
                self.nodeLogger.critical("acknowledging node %d as the leader for term %d", leaderIndex, self.currTerm)
                # a candidate that accepts a leader for its own term goes back to being a follower
                self.isCandidate = False
                self.voteTarget = None
                self.currLeader = leaderIndex
                self._save_node_state()
            willAppendEntries = True
        # read the term while still holding the lock so the reply matches the decision made above
        termForReply = self.currTerm
        self.nodeLogger.debug("releasing LOCK after executing append_entries RPC for leader node %d "
                              "which was in term %d", leaderIndex, leaderTerm)
    appendEntriesDuration = time.time() - appendEntriesStartTime
    self.nodeLogger.debug("while starting in term %d, executing append_entries for leader node %d "
                          "which was in term %d took %f seconds", termAtStartOfAppendEntries, leaderIndex,
                          leaderTerm, appendEntriesDuration)
    return (termForReply, willAppendEntries)
def call_append_entries(self, otherNodeDesc):
    '''send an append_entries/heartbeat 'message' to another node by calling that RPC on that node
    :param otherNodeDesc NodeRef: a description of the other node
    :return tuple(int, boolean): what term the other node thinks is most recent and whether the other node accepts
        this node as the leader; None if the other node could not be reached or did not answer in time
    '''
    # no assert on self.exposed_is_leader() here: it might not always be true because of concurrency
    appendEntriesRetVal = None
    nodeConnStream = None
    nodeConn = None
    heartbeatRpcStartTime = time.time()
    try:
        nodeConnStream = rpyc.SocketStream.connect(otherNodeDesc.host, otherNodeDesc.port,
                                                   timeout=RaftNode.CONNECTION_TIMEOUT, attempts=1)
        nodeConn = rpyc.connect_stream(nodeConnStream)
        timedAppendEntriesProxy = helpers.timed(nodeConn.root.append_entries, RaftNode.CONNECTION_TIMEOUT)
        appendEntriesPromise = timedAppendEntriesProxy(self.currTerm, self.identityIndex)
        remoteReply = appendEntriesPromise.value
        # copy into a local tuple so the result stays usable once the connection is closed
        appendEntriesRetVal = (remoteReply[0], remoteReply[1])
    except AsyncResultTimeout:
        self.nodeLogger.info("connection timed out while leader node %d in term %d tried to send append_entries "
                             "to node %s", self.identityIndex, self.currTerm, otherNodeDesc.name)
    except (socket.timeout, ConnectionRefusedError):
        self.nodeLogger.info("leader node %d in term %d was unable to connect to another node %s",
                             self.identityIndex, self.currTerm, otherNodeDesc.name)
    except EOFError:
        self.nodeLogger.info("leader node %d in term %d lost connection to another node %s",
                             self.identityIndex, self.currTerm, otherNodeDesc.name)
    except Exception as e:
        self.nodeLogger.error("Exception for leader node %d in term %d: %s\n%s\n%s",
                              self.identityIndex, self.currTerm, e.__doc__, str(e), traceback.format_exc())
    finally:
        # a connection is opened per heartbeat, so it has to be closed here or sockets pile up
        try:
            if nodeConn is not None:
                nodeConn.close()
            elif nodeConnStream is not None:
                nodeConnStream.close()
        except Exception as closeError:
            self.nodeLogger.debug("ignoring error while closing connection to node %s: %s",
                                  otherNodeDesc.name, closeError)
    heartbeatRpcDuration = time.time() - heartbeatRpcStartTime
    self.nodeLogger.debug("sending append_entries to other node %s took %f seconds", otherNodeDesc.name,
                          heartbeatRpcDuration)
    return appendEntriesRetVal
def exposed_request_vote(self, candidateTerm, candidateIndex):
    '''tries to get this node's vote in an election on behalf of an RPC-caller node which is a candidate in that election
    Meant to be called as RPC
    :param candidateTerm: the term which the caller/candidate node thinks is most recent & which its election is in
    :param candidateIndex: the index of that caller/candidate node
    :return tuple(int, boolean): what term this node thinks is most recent and whether this node is voting for the
        caller candidate node
    '''
    willVote = False
    voteRequestStartTime = time.time()
    self.nodeLogger.debug("about to acquire LOCK to execute request_vote RPC for candidate node %d "
                          "which was in term %d", candidateIndex, candidateTerm)
    # "with" guarantees the lock is released even if saving the state raises
    with self.stateLock:
        self.nodeLogger.debug("successfully acquired LOCK to execute request_vote RPC for candidate node %d "
                              "which was in term %d", candidateIndex, candidateTerm)
        termAtStartOfVoteRequest = self.currTerm
        if candidateTerm < self.currTerm:
            # stale candidate: refuse, and let it learn the newer term from the return value
            self.nodeLogger.info("while in term %d, received request_vote() from stale candidate %d "
                                 "which thought it was in term %d", self.currTerm, candidateIndex, candidateTerm)
        else:
            self.nodeLogger.debug(
                "while in term %d, executing request_vote on behalf of node %d, a candidate in term %d",
                self.currTerm, candidateIndex, candidateTerm)
            if candidateTerm > self.currTerm:
                # move to the newer term, dropping everything tied to the old one
                if self.voteTarget is not None:
                    self.nodeLogger.warning("was in election for term %d, voting for candidate node %d, "
                                            "when received request for vote in later term %d", self.currTerm,
                                            self.voteTarget, candidateTerm)
                    self.voteTarget = None
                # explicit None check: "x or -1" would log leader index 0 as -1
                self.nodeLogger.critical("was in term %d with candidate status %s and current leader index %d when "
                                         "received request for vote in higher term %d", self.currTerm,
                                         self.isCandidate,
                                         self.currLeader if self.currLeader is not None else -1,
                                         candidateTerm)
                self.isCandidate = False
                self.currLeader = None
                self.currTerm = candidateTerm
                self._save_node_state()
                self._restart_timer()
            elif self.exposed_is_leader():
                self.nodeLogger.warning("elected leader %d received request_vote() from candidate %d "
                                        "when both are in term %d", self.identityIndex, candidateIndex,
                                        self.currTerm)
            elif self.isCandidate:
                self.nodeLogger.warning("candidate node %d received request_vote() from other candidate %d "
                                        "when both are in term %d", self.identityIndex, candidateIndex,
                                        self.currTerm)
            elif self.currLeader is not None:
                self.nodeLogger.warning("follower node %d received request_vote() from candidate node %d when both "
                                        "are in term %d but another node %d has already been elected leader",
                                        self.identityIndex, candidateIndex, self.currTerm, self.currLeader)
            # vote only as a follower with no known leader and no vote cast yet in this term
            if not self.isCandidate and self.currLeader is None and self.voteTarget is None:
                self.nodeLogger.critical("casting vote for candidate node %d in term %d", candidateIndex, self.currTerm)
                self._restart_timer()
                self.voteTarget = candidateIndex
                self._save_node_state()
                willVote = True
        # read the term while still holding the lock so the reply matches the decision made above
        termForReply = self.currTerm
        self.nodeLogger.debug("releasing LOCK after executing request_vote RPC for candidate node %d "
                              "which was in term %d", candidateIndex, candidateTerm)
    voteRequestDuration = time.time() - voteRequestStartTime
    self.nodeLogger.debug("while starting in term %d, executing request_vote for candidate node %d which was in term %d "
                          "took %f seconds", termAtStartOfVoteRequest, candidateIndex, candidateTerm, voteRequestDuration)
    return (termForReply, willVote)
def call_request_vote(self, otherNodeDesc):
    '''sends a vote request message to some other node by calling that RPC on that node
    :param otherNodeDesc NodeRef: a description of the other node
    :return tuple(int, boolean): what term the other node thinks is most recent and
        whether the other node will vote for this one; None if this node is no longer a candidate,
        or the other node could not be reached or did not answer in time
    '''
    # Candidate status can be revoked by a concurrent RPC between the caller's check and this call,
    # so this is a guard rather than an assert (an assert would kill the calling thread, and is
    # removed entirely when Python runs with -O)
    if not self.isCandidate:
        self.nodeLogger.info("node %d is no longer a candidate in term %d, so it is not sending a vote request "
                             "to node %s", self.identityIndex, self.currTerm, otherNodeDesc.name)
        return None
    requestVoteRetVal = None
    nodeConnStream = None
    nodeConn = None
    voteRequestRpcStartTime = time.time()
    try:
        nodeConnStream = rpyc.SocketStream.connect(otherNodeDesc.host, otherNodeDesc.port,
                                                   timeout=RaftNode.CONNECTION_TIMEOUT, attempts=1)
        nodeConn = rpyc.connect_stream(nodeConnStream)
        timedRequestVoteProxy = helpers.timed(nodeConn.root.request_vote, RaftNode.CONNECTION_TIMEOUT)
        voteRequestPromise = timedRequestVoteProxy(self.currTerm, self.identityIndex)
        remoteReply = voteRequestPromise.value
        # copy into a local tuple so the result stays usable once the connection is closed
        requestVoteRetVal = (remoteReply[0], remoteReply[1])
    except AsyncResultTimeout:
        self.nodeLogger.info("connection timed out while candidate node %d in term %d tried to send request_vote "
                             "to node %s", self.identityIndex, self.currTerm, otherNodeDesc.name)
    except (socket.timeout, ConnectionRefusedError):
        self.nodeLogger.info("candidate node %d in term %d was unable to connect to another node %s",
                             self.identityIndex, self.currTerm, otherNodeDesc.name)
    except EOFError:
        self.nodeLogger.info("candidate node %d in term %d lost connection to another node %s",
                             self.identityIndex, self.currTerm, otherNodeDesc.name)
    except Exception as e:
        self.nodeLogger.error("Exception for candidate node %d in term %d: %s\n%s\n%s",
                              self.identityIndex, self.currTerm, e.__doc__, str(e), traceback.format_exc())
    finally:
        # a connection is opened per vote request, so it has to be closed here or sockets pile up
        try:
            if nodeConn is not None:
                nodeConn.close()
            elif nodeConnStream is not None:
                nodeConnStream.close()
        except Exception as closeError:
            self.nodeLogger.debug("ignoring error while closing connection to node %s: %s",
                                  otherNodeDesc.name, closeError)
    voteRequestRpcDuration = time.time() - voteRequestRpcStartTime
    self.nodeLogger.debug("sending vote request to node %s took %f seconds", otherNodeDesc.name,
                          voteRequestRpcDuration)
    return requestVoteRetVal
def check_for_election_timeout(self):
    '''checks whether the election timer has actually expired and if so starts an election in a new term

    Runs as the election timer's callback. The state lock is held only while node state is read or changed and
    is released around every vote-request RPC; all locking uses "with" so the lock can't stay held if
    something raises (e.g. while saving state to disk).
    '''
    self.nodeLogger.debug("about to acquire LOCK to check for election timeout")
    with self.stateLock:
        self.nodeLogger.debug("successfully acquired LOCK to check for election timeout")
        if self.exposed_is_leader():
            self.nodeLogger.info("this node is ignoring an election timeout because it's the leader and so releases the LOCK")
            return
        self.nodeLogger.debug("checking whether election should be started")
        if (time.time() - self.lastContactTimestamp) <= self.electionTimeout:
            # the timer was restarted after this callback was scheduled
            self.nodeLogger.debug("releasing LOCK after finding that it isn't time for an election yet")
            return
        # become a candidate in a new term and vote for ourselves
        self.isCandidate = True
        self.currLeader = None
        self.voteTarget = self.identityIndex
        self.currTerm += 1
        self._save_node_state()
        self._restart_timer()
        self.nodeLogger.critical("starting election for the new term %d", self.currTerm)
        electionTerm = self.currTerm
        numVotes = 1
        numNodes = 1 + len(self.otherNodes)
        nodesToContact = self.otherNodes.copy()
        self.nodeLogger.debug("about to contact the %d other nodes", len(nodesToContact))
        self.nodeLogger.debug("releasing the LOCK after starting election for term %d", self.currTerm)

    def electionIsOver():
        # true once this node stopped being a candidate, a leader is known, or the term moved on;
        # only call while holding the state lock
        return not self.isCandidate or self.currLeader is not None or electionTerm != self.currTerm

    while len(nodesToContact) > 0:
        currOtherNode = nodesToContact.pop(0)
        self.nodeLogger.debug("about to acquire LOCK to send vote request to node %s", currOtherNode.name)
        with self.stateLock:
            self.nodeLogger.debug("successfully acquired LOCK to send vote request to node %s", currOtherNode.name)
            if electionIsOver():
                self.nodeLogger.debug("releasing LOCK (before contacting node %s) as part of terminating the "
                                      "election which was running for term %d", currOtherNode.name, electionTerm)
                break
            self.nodeLogger.debug("releasing LOCK just before requesting vote from node %s", currOtherNode.name)
        self.nodeLogger.debug("sending vote request to node %s, with %d more nodes "
                              "to be contacted afterwards", currOtherNode.name, len(nodesToContact))
        # RPC made without the lock so incoming RPCs can be served meanwhile
        nodeVoteResponse = self.call_request_vote(currOtherNode)
        wonElection = False
        self.nodeLogger.debug("acquiring LOCK in order to process results of requesting vote from node %s",
                              currOtherNode.name)
        with self.stateLock:
            self.nodeLogger.debug("successfully acquired LOCK in order to process results of requesting vote from node %s",
                                  currOtherNode.name)
            # state may have changed while the lock was released for the RPC
            if electionIsOver():
                self.nodeLogger.debug("releasing LOCK (after contacting node %s) as part of terminating the "
                                      "election which was running for term %d", currOtherNode.name, electionTerm)
                break
            if nodeVoteResponse is None:
                # no answer: put the node at the back of the queue to be tried again
                nodesToContact.append(currOtherNode)
            elif nodeVoteResponse[1]:
                numVotes += 1
                self.nodeLogger.critical("received vote from other node %s in term %d", currOtherNode.name,
                                         self.currTerm)
            else:
                responderTerm = nodeVoteResponse[0]
                if responderTerm > self.currTerm:
                    self.nodeLogger.critical("terminating election for term %d because a vote request response"
                                             " informed this node of higher term %d", self.currTerm, responderTerm)
                    self.isCandidate = False
                    self.voteTarget = None
                    self.currLeader = None
                    self.currTerm = responderTerm
                    self._save_node_state()
                    self._restart_timer()
            if numVotes > numNodes / 2.0:
                self.nodeLogger.critical("becoming the leader for the term %d with %d votes!!!",
                                         self.currTerm, numVotes)
                self.isCandidate = False
                self.voteTarget = None
                self.currLeader = self.identityIndex
                self._save_node_state()
                wonElection = True
                self.nodeLogger.debug("releasing the LOCK after winning election for term %d", self.currTerm)
            else:
                self.nodeLogger.debug("releasing LOCK after contacting a node %s", currOtherNode.name)
        if wonElection:
            # heartbeats are sent without holding the lock; nothing is left to do for this election
            self.send_heartbeats()
            break
def send_heartbeats(self):
    '''as leader, send heartbeat/append_entries messages to all other nodes
    so that they don't start elections in new terms

    Schedules the next round before contacting anyone. Each node is contacted once per round: a node
    that doesn't answer is tried again in the next round rather than in this one, so a round always
    ends and rounds don't accumulate while a node is down. The state lock is released around every RPC.
    '''
    with self.stateLock:
        if not self.exposed_is_leader():
            self.nodeLogger.warning("in term %d, node attempted to send heartbeats out despite not being the leader (and releases the LOCK)",
                                    self.currTerm)
            return
        # schedule the next round first so its timing doesn't depend on how long this round takes
        heartbeatTimer = threading.Timer(self.heartbeatInterval, self.send_heartbeats)
        heartbeatTimer.start()
        leaderTerm = self.currTerm
        nodesToContact = self.otherNodes.copy()
        self.nodeLogger.debug("release the LOCK just before contacting the %d other nodes", len(nodesToContact))
    for nodePosition, currOtherNode in enumerate(nodesToContact):
        numNodesAfterwards = len(nodesToContact) - nodePosition - 1
        self.nodeLogger.debug("acquiring the LOCK to send heartbeat to node %s", currOtherNode.name)
        with self.stateLock:
            self.nodeLogger.debug("successfully acquired the LOCK to send heartbeat to node %s", currOtherNode.name)
            if not self.exposed_is_leader():
                self.nodeLogger.info("former leader (from term %d) is releasing the LOCK rather than try to send any more heartbeats (before trying to contact node %s)", leaderTerm, currOtherNode.name)
                break
            self.nodeLogger.debug("releasing LOCK just before sending heartbeat to node %s", currOtherNode.name)
        self.nodeLogger.debug("sending heartbeat to node %s, with %d more nodes to be contacted "
                              "afterwards", currOtherNode.name, numNodesAfterwards)
        # RPC made without the lock so incoming RPCs can be served meanwhile
        nodeHeartbeatResponse = self.call_append_entries(currOtherNode)
        self.nodeLogger.debug("acquiring the LOCK to process node %s 's response to a heartbeat", currOtherNode.name)
        with self.stateLock:
            self.nodeLogger.debug("successfully acquired the LOCK to process node %s 's response to a heartbeat", currOtherNode.name)
            # leadership may have been lost while the lock was released for the RPC
            if not self.exposed_is_leader():
                self.nodeLogger.info("former leader (from term %d) is releasing the LOCK rather than try to send any more heartbeats (after trying to contact node %s)", leaderTerm, currOtherNode.name)
                break
            if nodeHeartbeatResponse is None:
                self.nodeLogger.debug("no heartbeat response from node %s; it will be contacted again in the next round",
                                      currOtherNode.name)
            else:
                responderTerm = nodeHeartbeatResponse[0]
                if responderTerm > self.currTerm:
                    self.nodeLogger.critical("this node was leader in term %d but is abandoning that status because "
                                             "a heartbeat response informed it of a higher term %d",
                                             self.currTerm, responderTerm)
                    self.currLeader = None
                    self.currTerm = responderTerm
                    # persist the new term, as the other places that change the term do
                    self._save_node_state()
                    self._restart_timer()
            self.nodeLogger.debug("releasing the LOCK after sending heartbeat to node %s", currOtherNode.name)
if __name__ == '__main__':
    # Script entry point: python raftnode.py <config file> <node index>
    # Reads this node's port from the config file, then serves a RaftNode on that port.
    from rpyc.utils.server import ThreadPoolServer

    if len(sys.argv) < 3:
        sys.exit("usage: %s <config file> <node index>" % sys.argv[0])
    nodeNum = -1
    currNodePort = -1
    configFileName = sys.argv[1]
    currNodeIndex = int(sys.argv[2])
    with open(configFileName) as configFile:
        # first line must be "N:<node count>"
        nodeNumLine = configFile.readline()
        if nodeNumLine[:2] == "N:":
            nodeNum = int(nodeNumLine[2:].strip())
        else:
            print("invalid config file- bad initial node count line: %s" % nodeNumLine)
            raise Exception("bad config file")
        if currNodeIndex < 0:
            # a negative index would otherwise pass the bound check and pick a node from the end of the list
            print("unacceptable negative index %d for node" % currNodeIndex)
            raise Exception("bad node index")
        if currNodeIndex < nodeNum:
            # remaining lines are "name:host:port", one per node
            nodeDescriptions = configFile.readlines()
            if len(nodeDescriptions) == nodeNum:
                nodeTerms = nodeDescriptions[currNodeIndex].split(":")
                currNodePort = int(nodeTerms[2].strip())
            else:
                print("invalid config file- wrong number of lines of node descriptions %s" % nodeNumLine)
                raise Exception("bad config file")
        else:
            print("unacceptably high index %d for node system which only has %d nodes" % (currNodeIndex, nodeNum))
            raise Exception("bad node index")
    if currNodePort > 0:
        server = ThreadPoolServer(RaftNode(configFileName, currNodeIndex), port=currNodePort)
        server.start()