def make_servermap(self, mode=MODE_READ, oldmap=None):
    if oldmap is None:
        oldmap = ServerMap()
    smu = ServermapUpdater(self._fn, self._storage_broker, Monitor(),
                           oldmap, mode)
    d = smu.update()
    return d
def _get_servermap(self, mode):
    """
    I am a serialized twin to get_servermap.
    """
    servermap = ServerMap()
    d = self._update_servermap(servermap, mode)
    # The servermap will tell us about the most recent size of the
    # file, so we may as well set that so that callers might get
    # more data about us.
    if not self._most_recent_size:
        d.addCallback(self._get_size_from_servermap)
    return d
def _download_best_version(self):
    servermap = ServerMap()
    d = self._try_once_to_download_best_version(servermap, MODE_READ)
    def _maybe_retry(f):
        f.trap(NotEnoughSharesError)
        # the download is worth retrying once. Make sure to use the
        # old servermap, since it is what remembers the bad shares,
        # but use MODE_WRITE to make it look for even more shares.
        # TODO: consider allowing this to retry multiple times... this
        # approach will let us tolerate about 8 bad shares, I think.
        return self._try_once_to_download_best_version(servermap,
                                                       MODE_WRITE)
    d.addErrback(_maybe_retry)
    return d
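# A minimal, self-contained sketch of the retry-once errback pattern used
# in _download_best_version() above. The names fetch_once and
# NotEnoughSharesError below are stand-ins for illustration, not the real
# Tahoe objects: the point is only that f.trap() re-raises unrelated
# failures, while a matching failure triggers exactly one more attempt in
# a wider search mode.
from twisted.internet import defer

class NotEnoughSharesError(Exception):
    pass

def fetch_once(mode, _attempts=[0]):
    # pretend the first (MODE_READ-style) attempt cannot find enough shares
    _attempts[0] += 1
    if _attempts[0] == 1:
        return defer.fail(NotEnoughSharesError())
    return defer.succeed("file contents")

def download_best_version():
    d = fetch_once("MODE_READ")
    def _maybe_retry(f):
        f.trap(NotEnoughSharesError)    # anything else propagates
        return fetch_once("MODE_WRITE") # one retry, searching harder
    d.addErrback(_maybe_retry)
    return d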
def make_servermap(self, mode=MODE_CHECK, fn=None, sb=None,
                   update_range=None):
    if fn is None:
        fn = self._fn
    if sb is None:
        sb = self._storage_broker
    smu = ServermapUpdater(fn, sb, Monitor(), ServerMap(), mode,
                           update_range=update_range)
    d = smu.update()
    return d
def check(self, verify=False, add_lease=False):
    servermap = ServerMap()
    # Updating the servermap in MODE_CHECK will stand a good chance
    # of finding all of the shares, and getting a good idea of
    # recoverability, etc, without verifying.
    u = ServermapUpdater(self._node, self._storage_broker, self._monitor,
                         servermap, self.SERVERMAP_MODE,
                         add_lease=add_lease)
    if self._history:
        self._history.notify_mapupdate(u.get_status())
    d = u.update()
    d.addCallback(self._got_mapupdate_results)
    if verify:
        d.addCallback(self._verify_all_shares)
    d.addCallback(lambda res: servermap)
    d.addCallback(self._make_checker_results)
    return d
def check(self, verify=False, add_lease=False):
    servermap = ServerMap()
    u = ServermapUpdater(self._node, self._storage_broker, self._monitor,
                         servermap, MODE_CHECK, add_lease=add_lease)
    if self._history:
        self._history.notify_mapupdate(u.get_status())
    d = u.update()
    d.addCallback(self._got_mapupdate_results)
    if verify:
        d.addCallback(self._verify_all_shares)
    d.addCallback(lambda res: servermap)
    d.addCallback(self._fill_checker_results, self.results)
    d.addCallback(lambda res: self.results)
    return d
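# Both check() variants above rely on the same Deferred idiom:
# `d.addCallback(lambda res: servermap)` discards the previous callback's
# result and re-injects a chosen object into the chain, so the next
# callback receives the servermap (or the results object) no matter what
# the verification step returned. A tiny, hypothetical demonstration:
from twisted.internet import defer

def demo_reinject():
    servermap = {"sh0": "server A"}       # stand-in for a ServerMap
    d = defer.succeed("mapupdate result") # value produced upstream
    d.addCallback(lambda res: servermap)  # replace the chain's value
    d.addCallback(lambda sm: sorted(sm))  # sees the servermap, not the string
    return d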
def start(self, force=False):
    # download, then re-publish. If a server had a bad share, try to
    # replace it with a good one of the same shnum.

    # The normal repair operation should not be used to replace
    # application-specific merging of alternate versions: i.e. if there
    # are multiple highest seqnums with different roothashes. In this
    # case, the application must use node.upload() (referencing the
    # servermap that indicates the multiple-heads condition), or
    # node.overwrite(). The repair() operation will refuse to run in
    # these conditions unless a force=True argument is provided. If
    # force=True is used, then the highest root hash will be reinforced.

    # Likewise, the presence of an unrecoverable latest version is an
    # unusual event, and should ideally be handled by retrying a couple
    # times (spaced out over hours or days) and hoping that new shares
    # will become available. If repair(force=True) is called, data will
    # be lost: a new seqnum will be generated with the same contents as
    # the most recent recoverable version, skipping over the lost
    # version. repair(force=False) will refuse to run in a situation like
    # this.

    # Repair is designed to fix the following injuries:
    #  missing shares: add new ones to get at least N distinct ones
    #  old shares: replace old shares with the latest version
    #  bogus shares (bad sigs): replace the bad one with a good one

    # first, update the servermap in MODE_REPAIR, which finds all shares
    # and makes sure we get the privkey.
    u = ServermapUpdater(self.node, self._storage_broker, self._monitor,
                         ServerMap(), MODE_REPAIR)
    if self._history:
        self._history.notify_mapupdate(u.get_status())
    d = u.update()
    d.addCallback(self._got_full_servermap, force)
    return d
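# An illustrative, hypothetical helper that mirrors the policy spelled out
# in the comments above: repair refuses to run when there are multiple
# highest seqnums with different roothashes, or when the latest version is
# unrecoverable, unless force=True. Every name here is a stand-in for
# explanation; this is not the real repairer's API.
class MustForceRepairError(Exception):
    pass

def check_repair_preconditions(heads, latest_recoverable, force=False):
    """heads: list of (seqnum, roothash) pairs at the highest seqnum seen.
    latest_recoverable: whether the newest version can be reconstructed."""
    multiple_heads = len(set(roothash for (_seq, roothash) in heads)) > 1
    if (multiple_heads or not latest_recoverable) and not force:
        raise MustForceRepairError(
            "multiple heads or unrecoverable latest version: use "
            "repair(force=True), node.upload(), or node.overwrite()")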
class Publish:
    """I represent a single act of publishing the mutable file to the grid.
    I will only publish my data if the servermap I am using still represents
    the current state of the world.

    To make the initial publish, set servermap to None.
    """

    def __init__(self, filenode, storage_broker, servermap):
        self._node = filenode
        self._storage_broker = storage_broker
        self._servermap = servermap
        self._storage_index = self._node.get_storage_index()
        self._log_prefix = prefix = si_b2a(self._storage_index)[:5]
        num = self.log("Publish(%s): starting" % prefix, parent=None)
        self._log_number = num
        self._running = True
        self._first_write_error = None

        self._status = PublishStatus()
        self._status.set_storage_index(self._storage_index)
        self._status.set_helper(False)
        self._status.set_progress(0.0)
        self._status.set_active(True)

    def get_status(self):
        return self._status

    def log(self, *args, **kwargs):
        if 'parent' not in kwargs:
            kwargs['parent'] = self._log_number
        if "facility" not in kwargs:
            kwargs["facility"] = "tahoe.mutable.publish"
        return log.msg(*args, **kwargs)

    def publish(self, newdata):
        """Publish the filenode's current contents. Returns a Deferred that
        fires (with None) when the publish has done as much work as it's
        ever going to do, or errbacks with ConsistencyError if it detects a
        simultaneous write.
        """

        # 1: generate shares (SDMF: files are small, so we can do it in RAM)
        # 2: perform peer selection, get candidate servers
        #  2a: send queries to n+epsilon servers, to determine current shares
        #  2b: based upon responses, create target map
        # 3: send slot_testv_and_readv_and_writev messages
        # 4: as responses return, update share-dispatch table
        #  4a: may need to run recovery algorithm
        # 5: when enough responses are back, we're done

        self.log("starting publish, datalen is %s" % len(newdata))
        self._status.set_size(len(newdata))
        self._status.set_status("Started")
        self._started = time.time()

        self.done_deferred = defer.Deferred()

        self._writekey = self._node.get_writekey()
        assert self._writekey, "need write capability to publish"

        # first, which servers will we publish to? We require that the
        # servermap was updated in MODE_WRITE, so we can depend upon the
        # peerlist computed by that process instead of computing our own.
        if self._servermap:
            assert self._servermap.last_update_mode in (MODE_WRITE, MODE_CHECK)
            # we will push a version that is one larger than anything present
            # in the grid, according to the servermap.
            self._new_seqnum = self._servermap.highest_seqnum() + 1
        else:
            # If we don't have a servermap, that's because we're doing the
            # initial publish
            self._new_seqnum = 1
            self._servermap = ServerMap()
        self._status.set_servermap(self._servermap)

        self.log(format="new seqnum will be %(seqnum)d",
                 seqnum=self._new_seqnum, level=log.NOISY)

        # having an up-to-date servermap (or using a filenode that was just
        # created for the first time) also guarantees that the following
        # fields are available
        self.readkey = self._node.get_readkey()
        self.required_shares = self._node.get_required_shares()
        assert self.required_shares is not None
        self.total_shares = self._node.get_total_shares()
        assert self.total_shares is not None
        self._status.set_encoding(self.required_shares, self.total_shares)

        self._pubkey = self._node.get_pubkey()
        assert self._pubkey
        self._privkey = self._node.get_privkey()
        assert self._privkey
        self._encprivkey = self._node.get_encprivkey()

        sb = self._storage_broker
        full_peerlist = [(s.get_serverid(), s.get_rref())
                         for s in sb.get_servers_for_psi(self._storage_index)]
        self.full_peerlist = full_peerlist # for use later, immutable
        self.bad_peers = set() # peerids who have errbacked/refused requests

        self.newdata = newdata
        self.salt = os.urandom(16)

        self.setup_encoding_parameters()

        # if we experience any surprises (writes which were rejected because
        # our test vector did not match, or shares which we didn't expect to
        # see), we set this flag and report an UncoordinatedWriteError at the
        # end of the publish process.
        self.surprised = False

        # as a failsafe, refuse to iterate through self.loop more than a
        # thousand times.
        self.looplimit = 1000

        # we keep track of three tables. The first is our goal: which share
        # we want to see on which servers. This is initially populated by the
        # existing servermap.
        self.goal = set() # pairs of (peerid, shnum) tuples

        # the second table is our list of outstanding queries: those which
        # are in flight and may or may not be delivered, accepted, or
        # acknowledged. Items are added to this table when the request is
        # sent, and removed when the response returns (or errbacks).
        self.outstanding = set() # (peerid, shnum) tuples

        # the third is a table of successes: shares which have actually been
        # placed. These are populated when responses come back with success.
        # When self.placed == self.goal, we're done.
        self.placed = set() # (peerid, shnum) tuples

        # we also keep a mapping from peerid to RemoteReference. Each time we
        # pull a connection out of the full peerlist, we add it to this for
        # use later.
        self.connections = {}

        self.bad_share_checkstrings = {}

        # we use the servermap to populate the initial goal: this way we will
        # try to update each existing share in place.
        for (peerid, shnum) in self._servermap.servermap:
            self.goal.add((peerid, shnum))
            self.connections[peerid] = self._servermap.connections[peerid]
        # then we add in all the shares that were bad (corrupted, bad
        # signatures, etc). We want to replace these.
        for key, old_checkstring in self._servermap.bad_shares.items():
            (peerid, shnum) = key
            self.goal.add(key)
            self.bad_share_checkstrings[key] = old_checkstring
            self.connections[peerid] = self._servermap.connections[peerid]

        # create the shares. We'll discard these as they are delivered. SDMF:
        # we're allowed to hold everything in memory.

        self._status.timings["setup"] = time.time() - self._started
        d = self._encrypt_and_encode()
        d.addCallback(self._generate_shares)
        def _start_pushing(res):
            self._started_pushing = time.time()
            return res
        d.addCallback(_start_pushing)
        d.addCallback(self.loop) # trigger delivery
        d.addErrback(self._fatal_error)

        return self.done_deferred

    def setup_encoding_parameters(self):
        segment_size = len(self.newdata)
        # this must be a multiple of self.required_shares
        segment_size = mathutil.next_multiple(segment_size,
                                              self.required_shares)
        self.segment_size = segment_size
        if segment_size:
            self.num_segments = mathutil.div_ceil(len(self.newdata),
                                                  segment_size)
        else:
            self.num_segments = 0
        assert self.num_segments in [0, 1] # SDMF restrictions

    def _fatal_error(self, f):
        self.log("error during loop", failure=f, level=log.UNUSUAL)
        self._done(f)

    def _update_status(self):
        self._status.set_status("Sending Shares: %d placed out of %d, "
                                "%d messages outstanding" %
                                (len(self.placed),
                                 len(self.goal),
                                 len(self.outstanding)))
        self._status.set_progress(1.0 * len(self.placed) / len(self.goal))

    def loop(self, ignored=None):
        self.log("entering loop", level=log.NOISY)
        if not self._running:
            return

        self.looplimit -= 1
        if self.looplimit <= 0:
            raise LoopLimitExceededError("loop limit exceeded")

        if self.surprised:
            # don't send out any new shares, just wait for the outstanding
            # ones to be retired.
            self.log("currently surprised, so don't send any new shares",
                     level=log.NOISY)
        else:
            self.update_goal()
            # how far are we from our goal?
            needed = self.goal - self.placed - self.outstanding
            self._update_status()

            if needed:
                # we need to send out new shares
                self.log(format="need to send %(needed)d new shares",
                         needed=len(needed), level=log.NOISY)
                self._send_shares(needed)
                return

        if self.outstanding:
            # queries are still pending, keep waiting
            self.log(format="%(outstanding)d queries still outstanding",
                     outstanding=len(self.outstanding),
                     level=log.NOISY)
            return

        # no queries outstanding, no placements needed: we're done
        self.log("no queries outstanding, no placements needed: done",
                 level=log.OPERATIONAL)
        now = time.time()
        elapsed = now - self._started_pushing
        self._status.timings["push"] = elapsed
        return self._done(None)

    def log_goal(self, goal, message=""):
        logmsg = [message]
        for (shnum, peerid) in sorted([(s, p) for (p, s) in goal]):
            logmsg.append("sh%d to [%s]" %
                          (shnum, idlib.shortnodeid_b2a(peerid)))
        self.log("current goal: %s" % (", ".join(logmsg)), level=log.NOISY)
        self.log("we are planning to push new seqnum=#%d" % self._new_seqnum,
                 level=log.NOISY)

    def update_goal(self):
        # if log.recording_noisy
        if True:
            self.log_goal(self.goal, "before update: ")

        # first, remove any bad peers from our goal
        self.goal = set([(peerid, shnum)
                         for (peerid, shnum) in self.goal
                         if peerid not in self.bad_peers])

        # find the homeless shares:
        homefull_shares = set([shnum for (peerid, shnum) in self.goal])
        homeless_shares = set(range(self.total_shares)) - homefull_shares
        homeless_shares = sorted(list(homeless_shares))
        # place them somewhere. We prefer unused servers at the beginning of
        # the available peer list.

        if not homeless_shares:
            return

        # if an old share X is on a node, put the new share X there too.
        # TODO: 1: redistribute shares to achieve one-per-peer, by copying
        #       shares from existing peers to new (less-crowded) ones. The
        #       old shares must still be updated.
        # TODO: 2: move those shares instead of copying them, to reduce
        #       future update work

        # this is a bit CPU intensive but easy to analyze. We create a sort
        # order for each peerid. If the peerid is marked as bad, we don't
        # even put them in the list. Then we care about the number of shares
        # which have already been assigned to them. After that we care about
        # their permutation order.
        old_assignments = DictOfSets()
        for (peerid, shnum) in self.goal:
            old_assignments.add(peerid, shnum)

        peerlist = []
        for i, (peerid, ss) in enumerate(self.full_peerlist):
            if peerid in self.bad_peers:
                continue
            entry = (len(old_assignments.get(peerid, [])), i, peerid, ss)
            peerlist.append(entry)
        peerlist.sort()

        if not peerlist:
            raise NotEnoughServersError("Ran out of non-bad servers, "
                                        "first_error=%s" %
                                        str(self._first_write_error),
                                        self._first_write_error)

        # we then index this peerlist with an integer, because we may have to
        # wrap. We update the goal as we go.
        i = 0
        for shnum in homeless_shares:
            (ignored1, ignored2, peerid, ss) = peerlist[i]
            # if we are forced to send a share to a server that already has
            # one, we may have two write requests in flight, and the
            # servermap (which was computed before either request was sent)
            # won't reflect the new shares, so the second response will be
            # surprising. There is code in _got_write_answer() to tolerate
            # this, otherwise it would cause the publish to fail with an
            # UncoordinatedWriteError. See #546 for details of the trouble
            # this used to cause.
            self.goal.add((peerid, shnum))
            self.connections[peerid] = ss
            i += 1
            if i >= len(peerlist):
                i = 0
        if True:
            self.log_goal(self.goal, "after update: ")

    def _encrypt_and_encode(self):
        # this returns a Deferred that fires with a list of (sharedata,
        # sharenum) tuples. TODO: cache the ciphertext, only produce the
        # shares that we care about.
        self.log("_encrypt_and_encode")

        self._status.set_status("Encrypting")
        started = time.time()

        key = hashutil.ssk_readkey_data_hash(self.salt, self.readkey)
        enc = AES(key)
        crypttext = enc.process(self.newdata)
        assert len(crypttext) == len(self.newdata)

        now = time.time()
        self._status.timings["encrypt"] = now - started
        started = now

        # now apply FEC
        self._status.set_status("Encoding")
        fec = codec.CRSEncoder()
        fec.set_params(self.segment_size,
                       self.required_shares, self.total_shares)
        piece_size = fec.get_block_size()
        crypttext_pieces = [None] * self.required_shares
        for i in range(len(crypttext_pieces)):
            offset = i * piece_size
            piece = crypttext[offset:offset+piece_size]
            piece = piece + "\x00"*(piece_size - len(piece)) # padding
            crypttext_pieces[i] = piece
            assert len(piece) == piece_size

        d = fec.encode(crypttext_pieces)
        def _done_encoding(res):
            elapsed = time.time() - started
            self._status.timings["encode"] = elapsed
            return res
        d.addCallback(_done_encoding)
        return d

    def _generate_shares(self, shares_and_shareids):
        # this sets self.shares and self.root_hash
        self.log("_generate_shares")
        self._status.set_status("Generating Shares")
        started = time.time()

        # we should know these by now
        privkey = self._privkey
        encprivkey = self._encprivkey
        pubkey = self._pubkey

        (shares, share_ids) = shares_and_shareids

        assert len(shares) == len(share_ids)
        assert len(shares) == self.total_shares
        all_shares = {}
        block_hash_trees = {}
        share_hash_leaves = [None] * len(shares)
        for i in range(len(shares)):
            share_data = shares[i]
            shnum = share_ids[i]
            all_shares[shnum] = share_data

            # build the block hash tree. SDMF has only one leaf.
            leaves = [hashutil.block_hash(share_data)]
            t = hashtree.HashTree(leaves)
            block_hash_trees[shnum] = list(t)
            share_hash_leaves[shnum] = t[0]
        for leaf in share_hash_leaves:
            assert leaf is not None
        share_hash_tree = hashtree.HashTree(share_hash_leaves)
        share_hash_chain = {}
        for shnum in range(self.total_shares):
            needed_hashes = share_hash_tree.needed_hashes(shnum)
            share_hash_chain[shnum] = dict([(i, share_hash_tree[i])
                                            for i in needed_hashes])
        root_hash = share_hash_tree[0]
        assert len(root_hash) == 32
        self.log("my new root_hash is %s" % base32.b2a(root_hash))
        self._new_version_info = (self._new_seqnum, root_hash, self.salt)

        prefix = pack_prefix(self._new_seqnum, root_hash, self.salt,
                             self.required_shares, self.total_shares,
                             self.segment_size, len(self.newdata))

        # now pack the beginning of the share. All shares are the same up
        # to the signature, then they have divergent share hash chains,
        # then completely different block hash trees + salt + share data,
        # then they all share the same encprivkey at the end. The sizes
        # of everything are the same for all shares.

        sign_started = time.time()
        signature = privkey.sign(prefix)
        self._status.timings["sign"] = time.time() - sign_started

        verification_key = pubkey.serialize()

        final_shares = {}
        for shnum in range(self.total_shares):
            final_share = pack_share(prefix,
                                     verification_key,
                                     signature,
                                     share_hash_chain[shnum],
                                     block_hash_trees[shnum],
                                     all_shares[shnum],
                                     encprivkey)
            final_shares[shnum] = final_share
        elapsed = time.time() - started
        self._status.timings["pack"] = elapsed
        self.shares = final_shares
        self.root_hash = root_hash

        # we also need to build up the version identifier for what we're
        # pushing. Extract the offsets from one of our shares.
        assert final_shares
        offsets = unpack_header(final_shares.values()[0])[-1]
        offsets_tuple = tuple([(key, value)
                               for key, value in offsets.items()])
        verinfo = (self._new_seqnum, root_hash, self.salt,
                   self.segment_size, len(self.newdata),
                   self.required_shares, self.total_shares,
                   prefix, offsets_tuple)
        self.versioninfo = verinfo

    def _send_shares(self, needed):
        self.log("_send_shares")

        # we're finally ready to send out our shares. If we encounter any
        # surprises here, it's because somebody else is writing at the same
        # time. (Note: in the future, when we remove the _query_peers() step
        # and instead speculate about [or remember] which shares are where,
        # surprises here are *not* indications of UncoordinatedWriteError,
        # and we'll need to respond to them more gracefully.)

        # needed is a set of (peerid, shnum) tuples. The first thing we do is
        # organize it by peerid.
        peermap = DictOfSets()
        for (peerid, shnum) in needed:
            peermap.add(peerid, shnum)

        # the next thing is to build up a bunch of test vectors. The
        # semantics of Publish are that we perform the operation if the world
        # hasn't changed since the ServerMap was constructed (more or less).
        # For every share we're trying to place, we create a test vector that
        # tests to see if the server*share still corresponds to the
        # map.

        all_tw_vectors = {} # maps peerid to tw_vectors
        sm = self._servermap.servermap

        for key in needed:
            (peerid, shnum) = key

            if key in sm:
                # an old version of that share already exists on the
                # server, according to our servermap. We will create a
                # request that attempts to replace it.
                old_versionid, old_timestamp = sm[key]
                (old_seqnum, old_root_hash, old_salt, old_segsize,
                 old_datalength, old_k, old_N, old_prefix,
                 old_offsets_tuple) = old_versionid
                old_checkstring = pack_checkstring(old_seqnum,
                                                   old_root_hash,
                                                   old_salt)
                testv = (0, len(old_checkstring), "eq", old_checkstring)

            elif key in self.bad_share_checkstrings:
                old_checkstring = self.bad_share_checkstrings[key]
                testv = (0, len(old_checkstring), "eq", old_checkstring)

            else:
                # add a testv that requires the share not exist

                # Unfortunately, foolscap-0.2.5 has a bug in the way inbound
                # constraints are handled. If the same object is referenced
                # multiple times inside the arguments, foolscap emits a
                # 'reference' token instead of a distinct copy of the
                # argument. The bug is that these 'reference' tokens are not
                # accepted by the inbound constraint code. To work around
                # this, we need to prevent python from interning the
                # (constant) tuple, by creating a new copy of this vector
                # each time.

                # This bug is fixed in foolscap-0.2.6, and even though this
                # version of Tahoe requires foolscap-0.3.1 or newer, we are
                # supposed to be able to interoperate with older versions of
                # Tahoe which are allowed to use older versions of foolscap,
                # including foolscap-0.2.5 . In addition, I've seen other
                # foolscap problems triggered by 'reference' tokens (see #541
                # for details). So we must keep this workaround in place.

                #testv = (0, 1, 'eq', "")
                testv = tuple([0, 1, 'eq', ""])

            testvs = [testv]
            # the write vector is simply the share
            writev = [(0, self.shares[shnum])]

            if peerid not in all_tw_vectors:
                all_tw_vectors[peerid] = {}
                # maps shnum to (testvs, writevs, new_length)
            assert shnum not in all_tw_vectors[peerid]

            all_tw_vectors[peerid][shnum] = (testvs, writev, None)

        # we read the checkstring back from each share, however we only use
        # it to detect whether there was a new share that we didn't know
        # about. The success or failure of the write will tell us whether
        # there was a collision or not. If there is a collision, the first
        # thing we'll do is update the servermap, which will find out what
        # happened. We could conceivably reduce a roundtrip by using the
        # readv checkstring to populate the servermap, but really we'd have
        # to read enough data to validate the signatures too, so it wouldn't
        # be an overall win.
        read_vector = [(0, struct.calcsize(SIGNED_PREFIX))]

        # ok, send the messages!
        self.log("sending %d shares" % len(all_tw_vectors), level=log.NOISY)
        started = time.time()
        for (peerid, tw_vectors) in all_tw_vectors.items():

            write_enabler = self._node.get_write_enabler(peerid)
            renew_secret = self._node.get_renewal_secret(peerid)
            cancel_secret = self._node.get_cancel_secret(peerid)
            secrets = (write_enabler, renew_secret, cancel_secret)
            shnums = tw_vectors.keys()

            for shnum in shnums:
                self.outstanding.add((peerid, shnum))

            d = self._do_testreadwrite(peerid, secrets,
                                       tw_vectors, read_vector)
            d.addCallbacks(self._got_write_answer, self._got_write_error,
                           callbackArgs=(peerid, shnums, started),
                           errbackArgs=(peerid, shnums, started))
            # tolerate immediate errback, like with DeadReferenceError
            d.addBoth(fireEventually)
            d.addCallback(self.loop)
            d.addErrback(self._fatal_error)

        self._update_status()
        self.log("%d shares sent" % len(all_tw_vectors), level=log.NOISY)

    def _do_testreadwrite(self, peerid, secrets,
                          tw_vectors, read_vector):
        storage_index = self._storage_index
        ss = self.connections[peerid]

        #print "SS[%s] is %s" % (idlib.shortnodeid_b2a(peerid), ss), ss.tracker.interfaceName
        d = ss.callRemote("slot_testv_and_readv_and_writev",
                          storage_index,
                          secrets,
                          tw_vectors,
                          read_vector)
        return d

    def _got_write_answer(self, answer, peerid, shnums, started):
        lp = self.log("_got_write_answer from %s" %
                      idlib.shortnodeid_b2a(peerid))
        for shnum in shnums:
            self.outstanding.discard((peerid, shnum))

        now = time.time()
        elapsed = now - started
        self._status.add_per_server_time(peerid, elapsed)

        wrote, read_data = answer

        surprise_shares = set(read_data.keys()) - set(shnums)

        surprised = False
        for shnum in surprise_shares:
            # read_data is a dict mapping shnum to checkstring (SIGNED_PREFIX)
            checkstring = read_data[shnum][0]
            their_version_info = unpack_checkstring(checkstring)
            if their_version_info == self._new_version_info:
                # they have the right share, somehow

                if (peerid, shnum) in self.goal:
                    # and we want them to have it, so we probably sent them a
                    # copy in an earlier write. This is ok, and avoids the
                    # #546 problem.
                    continue

                # They aren't in our goal, but they are still for the right
                # version. Somebody else wrote them, and it's a convergent
                # uncoordinated write. Pretend this is ok (don't be
                # surprised), since I suspect there's a decent chance that
                # we'll hit this in normal operation.
                continue

            else:
                # the new shares are of a different version
                if peerid in self._servermap.reachable_peers:
                    # we asked them about their shares, so we had knowledge
                    # of what they used to have. Any surprising shares must
                    # have come from someone else, so UCW.
                    surprised = True
                else:
                    # we didn't ask them, and now we've discovered that they
                    # have a share we didn't know about. This indicates that
                    # mapupdate should have worked harder and asked more
                    # servers before concluding that it knew about them all.

                    # signal UCW, but make sure to ask this peer next time,
                    # so we'll remember to update it if/when we retry.
                    surprised = True
                    # TODO: ask this peer next time. I don't yet have a good
                    # way to do this. Two insufficient possibilities are:
                    #
                    # self._servermap.add_new_share(peerid, shnum, verinfo, now)
                    #  but that requires fetching/validating/parsing the whole
                    #  version string, and all we have is the checkstring
                    # self._servermap.mark_bad_share(peerid, shnum, checkstring)
                    #  that will make publish overwrite the share next time,
                    #  but it won't re-query the server, and it won't make
                    #  mapupdate search further

                    # TODO later: when publish starts, do
                    # servermap.get_best_version(), extract the seqnum,
                    # subtract one, and store as highest-replaceable-seqnum.
                    # Then, if this surprise-because-we-didn't-ask share is
                    # of highest-replaceable-seqnum or lower, we're allowed
                    # to replace it: send out a new writev (or rather add it
                    # to self.goal and loop).
                    pass
                surprised = True

        if surprised:
            self.log("they had shares %s that we didn't know about" %
                     (list(surprise_shares),),
                     parent=lp, level=log.WEIRD, umid="un9CSQ")
            self.surprised = True

        if not wrote:
            # TODO: there are two possibilities. The first is that the server
            # is full (or just doesn't want to give us any room), which means
            # we shouldn't ask them again, but is *not* an indication of an
            # uncoordinated write. The second is that our testv failed, which
            # *does* indicate an uncoordinated write. We currently don't have
            # a way to tell these two apart (in fact, the storage server code
            # doesn't have the option of refusing our share).
            #
            # If the server is full, mark the peer as bad (so we don't ask
            # them again), but don't set self.surprised. The loop() will find
            # a new server.
            #
            # If the testv failed, log it, set self.surprised, but don't
            # bother adding to self.bad_peers .

            self.log("our testv failed, so the write did not happen",
                     parent=lp, level=log.WEIRD, umid="8sc26g")
            self.surprised = True
            self.bad_peers.add(peerid) # don't ask them again
            # use the checkstring to add information to the log message
            for (shnum, readv) in read_data.items():
                checkstring = readv[0]
                (other_seqnum,
                 other_roothash,
                 other_salt) = unpack_checkstring(checkstring)
                expected_version = self._servermap.version_on_peer(peerid,
                                                                   shnum)
                if expected_version:
                    (seqnum, root_hash, IV, segsize, datalength, k, N, prefix,
                     offsets_tuple) = expected_version
                    self.log("somebody modified the share on us:"
                             " shnum=%d: I thought they had #%d:R=%s,"
                             " but testv reported #%d:R=%s" %
                             (shnum,
                              seqnum, base32.b2a(root_hash)[:4],
                              other_seqnum, base32.b2a(other_roothash)[:4]),
                             parent=lp, level=log.NOISY)
                # if expected_version==None, then we didn't expect to see a
                # share on that peer, and the 'surprise_shares' clause above
                # will have logged it.
            # self.loop() will take care of finding new homes
            return

        for shnum in shnums:
            self.placed.add((peerid, shnum))
            # and update the servermap
            self._servermap.add_new_share(peerid, shnum,
                                          self.versioninfo, started)

        # self.loop() will take care of checking to see if we're done
        return

    def _got_write_error(self, f, peerid, shnums, started):
        for shnum in shnums:
            self.outstanding.discard((peerid, shnum))
        self.bad_peers.add(peerid)
        if self._first_write_error is None:
            self._first_write_error = f
        self.log(format="error while writing shares %(shnums)s to peerid %(peerid)s",
                 shnums=list(shnums), peerid=idlib.shortnodeid_b2a(peerid),
                 failure=f,
                 level=log.UNUSUAL)
        # self.loop() will take care of checking to see if we're done
        return

    def _done(self, res):
        if not self._running:
            return
        self._running = False
        now = time.time()
        self._status.timings["total"] = now - self._started
        self._status.set_active(False)
        if isinstance(res, failure.Failure):
            self.log("Publish done, with failure", failure=res,
                     level=log.WEIRD, umid="nRsR9Q")
            self._status.set_status("Failed")
        elif self.surprised:
            self.log("Publish done, UncoordinatedWriteError",
                     level=log.UNUSUAL)
            self._status.set_status("UncoordinatedWriteError")
            # deliver a failure
            res = failure.Failure(UncoordinatedWriteError())
            # TODO: recovery
        else:
            self.log("Publish done, success")
            self._status.set_status("Finished")
            self._status.set_progress(1.0)
        eventually(self.done_deferred.callback, res)
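# A minimal sketch of the test-vector semantics that _send_shares() relies
# on (assumed behavior for illustration; the authoritative check lives in
# the storage server). A write is applied only if every
# (offset, length, "eq", data) test vector matches the share currently on
# disk, which is how Publish detects a simultaneous writer.
def test_vector_matches(share_data, testv):
    (offset, length, op, expected) = testv
    assert op == "eq"  # the only operator Publish uses here
    return share_data[offset:offset+length] == expected

# The "share must not exist" case uses (0, 1, 'eq', ""): reading one byte
# of an absent share yields "", so the test passes only when no share is
# present, e.g. test_vector_matches("", (0, 1, "eq", "")) is True.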
def _get_servermap(self, mode):
    servermap = ServerMap()
    return self._update_servermap(servermap, mode)
def _modify(self, modifier, backoffer):
    servermap = ServerMap()
    if backoffer is None:
        backoffer = BackoffAgent().delay
    return self._modify_and_retry(servermap, modifier, backoffer, True)
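# A hedged sketch of the kind of backoff policy a BackoffAgent.delay hook
# could implement (hypothetical: the real BackoffAgent is defined elsewhere
# in the codebase). _modify_and_retry would consult it between attempts
# when a simultaneous write forces a retry.
import random

class SimpleBackoff:
    def __init__(self, base=1.0, factor=2.0, max_delay=60.0):
        self._delay = base
        self._factor = factor
        self._max = max_delay

    def next_delay(self):
        # jittered exponential backoff: randomize around the current delay,
        # then grow it for the next attempt, capped at max_delay
        d = self._delay * random.uniform(0.5, 1.5)
        self._delay = min(self._delay * self._factor, self._max)
        return d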
def _overwrite(self, new_contents):
    servermap = ServerMap()
    d = self._update_servermap(servermap, mode=MODE_WRITE)
    d.addCallback(lambda ignored: self._upload(new_contents, servermap))
    return d
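# A usage sketch (not from the source) tying the pieces together: publish()
# requires a servermap freshly updated in MODE_WRITE, so a standalone
# caller would look roughly like this. `node`, `storage_broker`, and
# `update_servermap` are assumed to exist in the caller's scope.
def upload_new_contents(node, storage_broker, update_servermap, newdata):
    d = update_servermap(ServerMap(), mode=MODE_WRITE) # peerlist comes first
    def _publish(servermap):
        p = Publish(node, storage_broker, servermap)
        return p.publish(newdata) # fires with None, or errbacks when a
                                  # simultaneous write is detected
    d.addCallback(_publish)
    return d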
class Publish: """I represent a single act of publishing the mutable file to the grid. I will only publish my data if the servermap I am using still represents the current state of the world. To make the initial publish, set servermap to None. """ def __init__(self, filenode, storage_broker, servermap): self._node = filenode self._storage_broker = storage_broker self._servermap = servermap self._storage_index = self._node.get_storage_index() self._log_prefix = prefix = si_b2a(self._storage_index)[:5] num = self.log("Publish(%s): starting" % prefix, parent=None) self._log_number = num self._running = True self._first_write_error = None self._status = PublishStatus() self._status.set_storage_index(self._storage_index) self._status.set_helper(False) self._status.set_progress(0.0) self._status.set_active(True) def get_status(self): return self._status def log(self, *args, **kwargs): if 'parent' not in kwargs: kwargs['parent'] = self._log_number if "facility" not in kwargs: kwargs["facility"] = "tahoe.mutable.publish" return log.msg(*args, **kwargs) def publish(self, newdata): """Publish the filenode's current contents. Returns a Deferred that fires (with None) when the publish has done as much work as it's ever going to do, or errbacks with ConsistencyError if it detects a simultaneous write. """ # 1: generate shares (SDMF: files are small, so we can do it in RAM) # 2: perform peer selection, get candidate servers # 2a: send queries to n+epsilon servers, to determine current shares # 2b: based upon responses, create target map # 3: send slot_testv_and_readv_and_writev messages # 4: as responses return, update share-dispatch table # 4a: may need to run recovery algorithm # 5: when enough responses are back, we're done self.log("starting publish, datalen is %s" % len(newdata)) self._status.set_size(len(newdata)) self._status.set_status("Started") self._started = time.time() self.done_deferred = defer.Deferred() self._writekey = self._node.get_writekey() assert self._writekey, "need write capability to publish" # first, which servers will we publish to? We require that the # servermap was updated in MODE_WRITE, so we can depend upon the # peerlist computed by that process instead of computing our own. if self._servermap: assert self._servermap.last_update_mode in (MODE_WRITE, MODE_CHECK) # we will push a version that is one larger than anything present # in the grid, according to the servermap. 
self._new_seqnum = self._servermap.highest_seqnum() + 1 else: # If we don't have a servermap, that's because we're doing the # initial publish self._new_seqnum = 1 self._servermap = ServerMap() self._status.set_servermap(self._servermap) self.log(format="new seqnum will be %(seqnum)d", seqnum=self._new_seqnum, level=log.NOISY) # having an up-to-date servermap (or using a filenode that was just # created for the first time) also guarantees that the following # fields are available self.readkey = self._node.get_readkey() self.required_shares = self._node.get_required_shares() assert self.required_shares is not None self.total_shares = self._node.get_total_shares() assert self.total_shares is not None self._status.set_encoding(self.required_shares, self.total_shares) self._pubkey = self._node.get_pubkey() assert self._pubkey self._privkey = self._node.get_privkey() assert self._privkey self._encprivkey = self._node.get_encprivkey() sb = self._storage_broker full_peerlist = sb.get_servers_for_index(self._storage_index) self.full_peerlist = full_peerlist # for use later, immutable self.bad_peers = set() # peerids who have errbacked/refused requests self.newdata = newdata self.salt = os.urandom(16) self.setup_encoding_parameters() # if we experience any surprises (writes which were rejected because # our test vector did not match, or shares which we didn't expect to # see), we set this flag and report an UncoordinatedWriteError at the # end of the publish process. self.surprised = False # as a failsafe, refuse to iterate through self.loop more than a # thousand times. self.looplimit = 1000 # we keep track of three tables. The first is our goal: which share # we want to see on which servers. This is initially populated by the # existing servermap. self.goal = set() # pairs of (peerid, shnum) tuples # the second table is our list of outstanding queries: those which # are in flight and may or may not be delivered, accepted, or # acknowledged. Items are added to this table when the request is # sent, and removed when the response returns (or errbacks). self.outstanding = set() # (peerid, shnum) tuples # the third is a table of successes: share which have actually been # placed. These are populated when responses come back with success. # When self.placed == self.goal, we're done. self.placed = set() # (peerid, shnum) tuples # we also keep a mapping from peerid to RemoteReference. Each time we # pull a connection out of the full peerlist, we add it to this for # use later. self.connections = {} self.bad_share_checkstrings = {} # we use the servermap to populate the initial goal: this way we will # try to update each existing share in place. for (peerid, shnum) in self._servermap.servermap: self.goal.add( (peerid, shnum) ) self.connections[peerid] = self._servermap.connections[peerid] # then we add in all the shares that were bad (corrupted, bad # signatures, etc). We want to replace these. for key, old_checkstring in self._servermap.bad_shares.items(): (peerid, shnum) = key self.goal.add(key) self.bad_share_checkstrings[key] = old_checkstring self.connections[peerid] = self._servermap.connections[peerid] # create the shares. We'll discard these as they are delivered. SDMF: # we're allowed to hold everything in memory. 
self._status.timings["setup"] = time.time() - self._started d = self._encrypt_and_encode() d.addCallback(self._generate_shares) def _start_pushing(res): self._started_pushing = time.time() return res d.addCallback(_start_pushing) d.addCallback(self.loop) # trigger delivery d.addErrback(self._fatal_error) return self.done_deferred def setup_encoding_parameters(self): segment_size = len(self.newdata) # this must be a multiple of self.required_shares segment_size = mathutil.next_multiple(segment_size, self.required_shares) self.segment_size = segment_size if segment_size: self.num_segments = mathutil.div_ceil(len(self.newdata), segment_size) else: self.num_segments = 0 assert self.num_segments in [0, 1,] # SDMF restrictions def _fatal_error(self, f): self.log("error during loop", failure=f, level=log.UNUSUAL) self._done(f) def _update_status(self): self._status.set_status("Sending Shares: %d placed out of %d, " "%d messages outstanding" % (len(self.placed), len(self.goal), len(self.outstanding))) self._status.set_progress(1.0 * len(self.placed) / len(self.goal)) def loop(self, ignored=None): self.log("entering loop", level=log.NOISY) if not self._running: return self.looplimit -= 1 if self.looplimit <= 0: raise LoopLimitExceededError("loop limit exceeded") if self.surprised: # don't send out any new shares, just wait for the outstanding # ones to be retired. self.log("currently surprised, so don't send any new shares", level=log.NOISY) else: self.update_goal() # how far are we from our goal? needed = self.goal - self.placed - self.outstanding self._update_status() if needed: # we need to send out new shares self.log(format="need to send %(needed)d new shares", needed=len(needed), level=log.NOISY) self._send_shares(needed) return if self.outstanding: # queries are still pending, keep waiting self.log(format="%(outstanding)d queries still outstanding", outstanding=len(self.outstanding), level=log.NOISY) return # no queries outstanding, no placements needed: we're done self.log("no queries outstanding, no placements needed: done", level=log.OPERATIONAL) now = time.time() elapsed = now - self._started_pushing self._status.timings["push"] = elapsed return self._done(None) def log_goal(self, goal, message=""): logmsg = [message] for (shnum, peerid) in sorted([(s,p) for (p,s) in goal]): logmsg.append("sh%d to [%s]" % (shnum, idlib.shortnodeid_b2a(peerid))) self.log("current goal: %s" % (", ".join(logmsg)), level=log.NOISY) self.log("we are planning to push new seqnum=#%d" % self._new_seqnum, level=log.NOISY) def update_goal(self): # if log.recording_noisy if True: self.log_goal(self.goal, "before update: ") # first, remove any bad peers from our goal self.goal = set([ (peerid, shnum) for (peerid, shnum) in self.goal if peerid not in self.bad_peers ]) # find the homeless shares: homefull_shares = set([shnum for (peerid, shnum) in self.goal]) homeless_shares = set(range(self.total_shares)) - homefull_shares homeless_shares = sorted(list(homeless_shares)) # place them somewhere. We prefer unused servers at the beginning of # the available peer list. if not homeless_shares: return # if an old share X is on a node, put the new share X there too. # TODO: 1: redistribute shares to achieve one-per-peer, by copying # shares from existing peers to new (less-crowded) ones. The # old shares must still be updated. # TODO: 2: move those shares instead of copying them, to reduce future # update work # this is a bit CPU intensive but easy to analyze. We create a sort # order for each peerid. 
If the peerid is marked as bad, we don't # even put them in the list. Then we care about the number of shares # which have already been assigned to them. After that we care about # their permutation order. old_assignments = DictOfSets() for (peerid, shnum) in self.goal: old_assignments.add(peerid, shnum) peerlist = [] for i, (peerid, ss) in enumerate(self.full_peerlist): if peerid in self.bad_peers: continue entry = (len(old_assignments.get(peerid, [])), i, peerid, ss) peerlist.append(entry) peerlist.sort() if not peerlist: raise NotEnoughServersError("Ran out of non-bad servers, " "first_error=%s" % str(self._first_write_error), self._first_write_error) # we then index this peerlist with an integer, because we may have to # wrap. We update the goal as we go. i = 0 for shnum in homeless_shares: (ignored1, ignored2, peerid, ss) = peerlist[i] # if we are forced to send a share to a server that already has # one, we may have two write requests in flight, and the # servermap (which was computed before either request was sent) # won't reflect the new shares, so the second response will be # surprising. There is code in _got_write_answer() to tolerate # this, otherwise it would cause the publish to fail with an # UncoordinatedWriteError. See #546 for details of the trouble # this used to cause. self.goal.add( (peerid, shnum) ) self.connections[peerid] = ss i += 1 if i >= len(peerlist): i = 0 if True: self.log_goal(self.goal, "after update: ") def _encrypt_and_encode(self): # this returns a Deferred that fires with a list of (sharedata, # sharenum) tuples. TODO: cache the ciphertext, only produce the # shares that we care about. self.log("_encrypt_and_encode") self._status.set_status("Encrypting") started = time.time() key = hashutil.ssk_readkey_data_hash(self.salt, self.readkey) enc = AES(key) crypttext = enc.process(self.newdata) assert len(crypttext) == len(self.newdata) now = time.time() self._status.timings["encrypt"] = now - started started = now # now apply FEC self._status.set_status("Encoding") fec = codec.CRSEncoder() fec.set_params(self.segment_size, self.required_shares, self.total_shares) piece_size = fec.get_block_size() crypttext_pieces = [None] * self.required_shares for i in range(len(crypttext_pieces)): offset = i * piece_size piece = crypttext[offset:offset+piece_size] piece = piece + "\x00"*(piece_size - len(piece)) # padding crypttext_pieces[i] = piece assert len(piece) == piece_size d = fec.encode(crypttext_pieces) def _done_encoding(res): elapsed = time.time() - started self._status.timings["encode"] = elapsed return res d.addCallback(_done_encoding) return d def _generate_shares(self, shares_and_shareids): # this sets self.shares and self.root_hash self.log("_generate_shares") self._status.set_status("Generating Shares") started = time.time() # we should know these by now privkey = self._privkey encprivkey = self._encprivkey pubkey = self._pubkey (shares, share_ids) = shares_and_shareids assert len(shares) == len(share_ids) assert len(shares) == self.total_shares all_shares = {} block_hash_trees = {} share_hash_leaves = [None] * len(shares) for i in range(len(shares)): share_data = shares[i] shnum = share_ids[i] all_shares[shnum] = share_data # build the block hash tree. SDMF has only one leaf. 
leaves = [hashutil.block_hash(share_data)] t = hashtree.HashTree(leaves) block_hash_trees[shnum] = list(t) share_hash_leaves[shnum] = t[0] for leaf in share_hash_leaves: assert leaf is not None share_hash_tree = hashtree.HashTree(share_hash_leaves) share_hash_chain = {} for shnum in range(self.total_shares): needed_hashes = share_hash_tree.needed_hashes(shnum) share_hash_chain[shnum] = dict( [ (i, share_hash_tree[i]) for i in needed_hashes ] ) root_hash = share_hash_tree[0] assert len(root_hash) == 32 self.log("my new root_hash is %s" % base32.b2a(root_hash)) self._new_version_info = (self._new_seqnum, root_hash, self.salt) prefix = pack_prefix(self._new_seqnum, root_hash, self.salt, self.required_shares, self.total_shares, self.segment_size, len(self.newdata)) # now pack the beginning of the share. All shares are the same up # to the signature, then they have divergent share hash chains, # then completely different block hash trees + salt + share data, # then they all share the same encprivkey at the end. The sizes # of everything are the same for all shares. sign_started = time.time() signature = privkey.sign(prefix) self._status.timings["sign"] = time.time() - sign_started verification_key = pubkey.serialize() final_shares = {} for shnum in range(self.total_shares): final_share = pack_share(prefix, verification_key, signature, share_hash_chain[shnum], block_hash_trees[shnum], all_shares[shnum], encprivkey) final_shares[shnum] = final_share elapsed = time.time() - started self._status.timings["pack"] = elapsed self.shares = final_shares self.root_hash = root_hash # we also need to build up the version identifier for what we're # pushing. Extract the offsets from one of our shares. assert final_shares offsets = unpack_header(final_shares.values()[0])[-1] offsets_tuple = tuple( [(key,value) for key,value in offsets.items()] ) verinfo = (self._new_seqnum, root_hash, self.salt, self.segment_size, len(self.newdata), self.required_shares, self.total_shares, prefix, offsets_tuple) self.versioninfo = verinfo def _send_shares(self, needed): self.log("_send_shares") # we're finally ready to send out our shares. If we encounter any # surprises here, it's because somebody else is writing at the same # time. (Note: in the future, when we remove the _query_peers() step # and instead speculate about [or remember] which shares are where, # surprises here are *not* indications of UncoordinatedWriteError, # and we'll need to respond to them more gracefully.) # needed is a set of (peerid, shnum) tuples. The first thing we do is # organize it by peerid. peermap = DictOfSets() for (peerid, shnum) in needed: peermap.add(peerid, shnum) # the next thing is to build up a bunch of test vectors. The # semantics of Publish are that we perform the operation if the world # hasn't changed since the ServerMap was constructed (more or less). # For every share we're trying to place, we create a test vector that # tests to see if the server*share still corresponds to the # map. all_tw_vectors = {} # maps peerid to tw_vectors sm = self._servermap.servermap for key in needed: (peerid, shnum) = key if key in sm: # an old version of that share already exists on the # server, according to our servermap. We will create a # request that attempts to replace it. 
old_versionid, old_timestamp = sm[key] (old_seqnum, old_root_hash, old_salt, old_segsize, old_datalength, old_k, old_N, old_prefix, old_offsets_tuple) = old_versionid old_checkstring = pack_checkstring(old_seqnum, old_root_hash, old_salt) testv = (0, len(old_checkstring), "eq", old_checkstring) elif key in self.bad_share_checkstrings: old_checkstring = self.bad_share_checkstrings[key] testv = (0, len(old_checkstring), "eq", old_checkstring) else: # add a testv that requires the share not exist # Unfortunately, foolscap-0.2.5 has a bug in the way inbound # constraints are handled. If the same object is referenced # multiple times inside the arguments, foolscap emits a # 'reference' token instead of a distinct copy of the # argument. The bug is that these 'reference' tokens are not # accepted by the inbound constraint code. To work around # this, we need to prevent python from interning the # (constant) tuple, by creating a new copy of this vector # each time. # This bug is fixed in foolscap-0.2.6, and even though this # version of Tahoe requires foolscap-0.3.1 or newer, we are # supposed to be able to interoperate with older versions of # Tahoe which are allowed to use older versions of foolscap, # including foolscap-0.2.5 . In addition, I've seen other # foolscap problems triggered by 'reference' tokens (see #541 # for details). So we must keep this workaround in place. #testv = (0, 1, 'eq', "") testv = tuple([0, 1, 'eq', ""]) testvs = [testv] # the write vector is simply the share writev = [(0, self.shares[shnum])] if peerid not in all_tw_vectors: all_tw_vectors[peerid] = {} # maps shnum to (testvs, writevs, new_length) assert shnum not in all_tw_vectors[peerid] all_tw_vectors[peerid][shnum] = (testvs, writev, None) # we read the checkstring back from each share, however we only use # it to detect whether there was a new share that we didn't know # about. The success or failure of the write will tell us whether # there was a collision or not. If there is a collision, the first # thing we'll do is update the servermap, which will find out what # happened. We could conceivably reduce a roundtrip by using the # readv checkstring to populate the servermap, but really we'd have # to read enough data to validate the signatures too, so it wouldn't # be an overall win. read_vector = [(0, struct.calcsize(SIGNED_PREFIX))] # ok, send the messages! 
self.log("sending %d shares" % len(all_tw_vectors), level=log.NOISY) started = time.time() for (peerid, tw_vectors) in all_tw_vectors.items(): write_enabler = self._node.get_write_enabler(peerid) renew_secret = self._node.get_renewal_secret(peerid) cancel_secret = self._node.get_cancel_secret(peerid) secrets = (write_enabler, renew_secret, cancel_secret) shnums = tw_vectors.keys() for shnum in shnums: self.outstanding.add( (peerid, shnum) ) d = self._do_testreadwrite(peerid, secrets, tw_vectors, read_vector) d.addCallbacks(self._got_write_answer, self._got_write_error, callbackArgs=(peerid, shnums, started), errbackArgs=(peerid, shnums, started)) # tolerate immediate errback, like with DeadReferenceError d.addBoth(fireEventually) d.addCallback(self.loop) d.addErrback(self._fatal_error) self._update_status() self.log("%d shares sent" % len(all_tw_vectors), level=log.NOISY) def _do_testreadwrite(self, peerid, secrets, tw_vectors, read_vector): storage_index = self._storage_index ss = self.connections[peerid] #print "SS[%s] is %s" % (idlib.shortnodeid_b2a(peerid), ss), ss.tracker.interfaceName d = ss.callRemote("slot_testv_and_readv_and_writev", storage_index, secrets, tw_vectors, read_vector) return d def _got_write_answer(self, answer, peerid, shnums, started): lp = self.log("_got_write_answer from %s" % idlib.shortnodeid_b2a(peerid)) for shnum in shnums: self.outstanding.discard( (peerid, shnum) ) now = time.time() elapsed = now - started self._status.add_per_server_time(peerid, elapsed) wrote, read_data = answer surprise_shares = set(read_data.keys()) - set(shnums) surprised = False for shnum in surprise_shares: # read_data is a dict mapping shnum to checkstring (SIGNED_PREFIX) checkstring = read_data[shnum][0] their_version_info = unpack_checkstring(checkstring) if their_version_info == self._new_version_info: # they have the right share, somehow if (peerid,shnum) in self.goal: # and we want them to have it, so we probably sent them a # copy in an earlier write. This is ok, and avoids the # #546 problem. continue # They aren't in our goal, but they are still for the right # version. Somebody else wrote them, and it's a convergent # uncoordinated write. Pretend this is ok (don't be # surprised), since I suspect there's a decent chance that # we'll hit this in normal operation. continue else: # the new shares are of a different version if peerid in self._servermap.reachable_peers: # we asked them about their shares, so we had knowledge # of what they used to have. Any surprising shares must # have come from someone else, so UCW. surprised = True else: # we didn't ask them, and now we've discovered that they # have a share we didn't know about. This indicates that # mapupdate should have wokred harder and asked more # servers before concluding that it knew about them all. # signal UCW, but make sure to ask this peer next time, # so we'll remember to update it if/when we retry. surprised = True # TODO: ask this peer next time. I don't yet have a good # way to do this. 
Two insufficient possibilities are: # # self._servermap.add_new_share(peerid, shnum, verinfo, now) # but that requires fetching/validating/parsing the whole # version string, and all we have is the checkstring # self._servermap.mark_bad_share(peerid, shnum, checkstring) # that will make publish overwrite the share next time, # but it won't re-query the server, and it won't make # mapupdate search further # TODO later: when publish starts, do # servermap.get_best_version(), extract the seqnum, # subtract one, and store as highest-replaceable-seqnum. # Then, if this surprise-because-we-didn't-ask share is # of highest-replaceable-seqnum or lower, we're allowed # to replace it: send out a new writev (or rather add it # to self.goal and loop). pass surprised = True if surprised: self.log("they had shares %s that we didn't know about" % (list(surprise_shares),), parent=lp, level=log.WEIRD, umid="un9CSQ") self.surprised = True if not wrote: # TODO: there are two possibilities. The first is that the server # is full (or just doesn't want to give us any room), which means # we shouldn't ask them again, but is *not* an indication of an # uncoordinated write. The second is that our testv failed, which # *does* indicate an uncoordinated write. We currently don't have # a way to tell these two apart (in fact, the storage server code # doesn't have the option of refusing our share). # # If the server is full, mark the peer as bad (so we don't ask # them again), but don't set self.surprised. The loop() will find # a new server. # # If the testv failed, log it, set self.surprised, but don't # bother adding to self.bad_peers . self.log("our testv failed, so the write did not happen", parent=lp, level=log.WEIRD, umid="8sc26g") self.surprised = True self.bad_peers.add(peerid) # don't ask them again # use the checkstring to add information to the log message for (shnum,readv) in read_data.items(): checkstring = readv[0] (other_seqnum, other_roothash, other_salt) = unpack_checkstring(checkstring) expected_version = self._servermap.version_on_peer(peerid, shnum) if expected_version: (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = expected_version self.log("somebody modified the share on us:" " shnum=%d: I thought they had #%d:R=%s," " but testv reported #%d:R=%s" % (shnum, seqnum, base32.b2a(root_hash)[:4], other_seqnum, base32.b2a(other_roothash)[:4]), parent=lp, level=log.NOISY) # if expected_version==None, then we didn't expect to see a # share on that peer, and the 'surprise_shares' clause above # will have logged it. 
            # self.loop() will take care of finding new homes
            return

        for shnum in shnums:
            self.placed.add( (peerid, shnum) )
            # and update the servermap
            self._servermap.add_new_share(peerid, shnum,
                                          self.versioninfo, started)

        # self.loop() will take care of checking to see if we're done
        return

    def _got_write_error(self, f, peerid, shnums, started):
        for shnum in shnums:
            self.outstanding.discard( (peerid, shnum) )
        self.bad_peers.add(peerid)
        if self._first_write_error is None:
            self._first_write_error = f
        self.log(format="error while writing shares %(shnums)s to peerid %(peerid)s",
                 shnums=list(shnums),
                 peerid=idlib.shortnodeid_b2a(peerid),
                 failure=f,
                 level=log.UNUSUAL)
        # self.loop() will take care of checking to see if we're done
        return

    def _done(self, res):
        if not self._running:
            return
        self._running = False
        now = time.time()
        self._status.timings["total"] = now - self._started
        self._status.set_active(False)
        if isinstance(res, failure.Failure):
            self.log("Publish done, with failure", failure=res,
                     level=log.WEIRD, umid="nRsR9Q")
            self._status.set_status("Failed")
        elif self.surprised:
            self.log("Publish done, UncoordinatedWriteError",
                     level=log.UNUSUAL)
            self._status.set_status("UncoordinatedWriteError")
            # deliver a failure
            res = failure.Failure(UncoordinatedWriteError())
            # TODO: recovery
        else:
            self.log("Publish done, success")
            self._status.set_status("Finished")
            self._status.set_progress(1.0)
        eventually(self.done_deferred.callback, res)
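
    # For reference, a sketch of the vector shapes that the
    # slot_testv_and_readv_and_writev call in _do_testreadwrite expects.
    # The literal offsets and names here are illustrative, not taken from
    # this module:
    #
    #   testv = [(0, len(checkstring), "eq", checkstring)]  # test vectors
    #   datav = [(offset, newdata)]                         # write vectors
    #   new_length = None                 # None means: leave the size alone
    #   tw_vectors = { shnum: (testv, datav, new_length) }
    #   read_vector = [(offset, length)]  # e.g. the SIGNED_PREFIX span
    #
    # The server runs every test vector against its current share and only
    # applies the writes if all tests pass; either way it returns the
    # read-vector results, which is how _got_write_answer learns about
    # surprise shares.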
    def publish(self, newdata):
        """Publish the filenode's current contents. Returns a Deferred that
        fires (with None) when the publish has done as much work as it's ever
        going to do, or errbacks with ConsistencyError if it detects a
        simultaneous write.
        """

        # 1: generate shares (SDMF: files are small, so we can do it in RAM)
        # 2: perform peer selection, get candidate servers
        #    2a: send queries to n+epsilon servers, to determine current shares
        #    2b: based upon responses, create target map
        # 3: send slot_testv_and_readv_and_writev messages
        # 4: as responses return, update share-dispatch table
        #    4a: may need to run recovery algorithm
        # 5: when enough responses are back, we're done

        self.log("starting publish, datalen is %s" % len(newdata))
        self._status.set_size(len(newdata))
        self._status.set_status("Started")
        self._started = time.time()

        self.done_deferred = defer.Deferred()

        self._writekey = self._node.get_writekey()
        assert self._writekey, "need write capability to publish"

        # first, which servers will we publish to? We require that the
        # servermap was updated in MODE_WRITE, so we can depend upon the
        # peerlist computed by that process instead of computing our own.
        if self._servermap:
            assert self._servermap.last_update_mode in (MODE_WRITE, MODE_CHECK)
            # we will push a version that is one larger than anything present
            # in the grid, according to the servermap.
            self._new_seqnum = self._servermap.highest_seqnum() + 1
        else:
            # If we don't have a servermap, that's because we're doing the
            # initial publish
            self._new_seqnum = 1
            self._servermap = ServerMap()
        self._status.set_servermap(self._servermap)

        self.log(format="new seqnum will be %(seqnum)d",
                 seqnum=self._new_seqnum, level=log.NOISY)

        # having an up-to-date servermap (or using a filenode that was just
        # created for the first time) also guarantees that the following
        # fields are available
        self.readkey = self._node.get_readkey()
        self.required_shares = self._node.get_required_shares()
        assert self.required_shares is not None
        self.total_shares = self._node.get_total_shares()
        assert self.total_shares is not None
        self._status.set_encoding(self.required_shares, self.total_shares)

        self._pubkey = self._node.get_pubkey()
        assert self._pubkey
        self._privkey = self._node.get_privkey()
        assert self._privkey
        self._encprivkey = self._node.get_encprivkey()

        sb = self._storage_broker
        full_peerlist = sb.get_servers_for_index(self._storage_index)
        self.full_peerlist = full_peerlist # for use later, immutable
        self.bad_peers = set() # peerids who have errbacked/refused requests

        self.newdata = newdata
        self.salt = os.urandom(16)

        self.setup_encoding_parameters()

        # if we experience any surprises (writes which were rejected because
        # our test vector did not match, or shares which we didn't expect to
        # see), we set this flag and report an UncoordinatedWriteError at the
        # end of the publish process.
        self.surprised = False

        # as a failsafe, refuse to iterate through self.loop more than a
        # thousand times.
        self.looplimit = 1000

        # we keep track of three tables. The first is our goal: which shares
        # we want to see on which servers. This is initially populated by the
        # existing servermap.
        self.goal = set() # pairs of (peerid, shnum) tuples

        # the second table is our list of outstanding queries: those which
        # are in flight and may or may not be delivered, accepted, or
        # acknowledged. Items are added to this table when the request is
        # sent, and removed when the response returns (or errbacks).
        self.outstanding = set() # (peerid, shnum) tuples

        # the third is a table of successes: shares which have actually been
        # placed. These are populated when responses come back with success.
        # When self.placed == self.goal, we're done.
        self.placed = set() # (peerid, shnum) tuples

        # we also keep a mapping from peerid to RemoteReference. Each time we
        # pull a connection out of the full peerlist, we add it to this for
        # use later.
        self.connections = {}

        self.bad_share_checkstrings = {}

        # we use the servermap to populate the initial goal: this way we will
        # try to update each existing share in place.
        for (peerid, shnum) in self._servermap.servermap:
            self.goal.add( (peerid, shnum) )
            self.connections[peerid] = self._servermap.connections[peerid]
        # then we add in all the shares that were bad (corrupted, bad
        # signatures, etc). We want to replace these.
        for key, old_checkstring in self._servermap.bad_shares.items():
            (peerid, shnum) = key
            self.goal.add(key)
            self.bad_share_checkstrings[key] = old_checkstring
            self.connections[peerid] = self._servermap.connections[peerid]

        # create the shares. We'll discard these as they are delivered. SDMF:
        # we're allowed to hold everything in memory.

        self._status.timings["setup"] = time.time() - self._started
        d = self._encrypt_and_encode()
        d.addCallback(self._generate_shares)
        def _start_pushing(res):
            self._started_pushing = time.time()
            return res
        d.addCallback(_start_pushing)
        d.addCallback(self.loop) # trigger delivery
        d.addErrback(self._fatal_error)

        return self.done_deferred
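
    # A minimal usage sketch for this class (hypothetical driver code; the
    # Publish constructor arguments are assumed to mirror the node /
    # storage-broker / servermap pattern used above): update the servermap
    # in MODE_WRITE first, then let publish() push the new version.
    #
    #   d = self._update_servermap(servermap, MODE_WRITE)
    #   def _push(smap):
    #       p = Publish(self._node, self._storage_broker, smap)
    #       return p.publish(newdata)
    #   d.addCallback(_push)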