def test_probe_waste_adding_one_server(self):
    hashes = [
        hashlib.md5("{:064x}".format(x).encode()).hexdigest()
        for x in range(100)]
    initial_services = 12
    self.api_client = self.mock_keep_services(count=initial_services)
    self.keep_client = arvados.KeepClient(api_client=self.api_client)
    probes_before = [
        self.keep_client.weighted_service_roots(arvados.KeepLocator(hash))
        for hash in hashes]
    for added_services in range(1, 12):
        api_client = self.mock_keep_services(
            count=initial_services + added_services)
        keep_client = arvados.KeepClient(api_client=api_client)
        total_penalty = 0
        for hash_index in range(len(hashes)):
            probe_after = keep_client.weighted_service_roots(
                arvados.KeepLocator(hashes[hash_index]))
            penalty = probe_after.index(probes_before[hash_index][0])
            self.assertLessEqual(penalty, added_services)
            total_penalty += penalty
        # Average penalty per block should not exceed
        # N(added)/N(orig) by more than 20%, and should get closer
        # to the ideal as we add data points.
        expect_penalty = added_services * len(hashes) / initial_services
        max_penalty = expect_penalty * (120 - added_services) / 100
        min_penalty = expect_penalty * 8 / 10
        self.assertTrue(
            min_penalty <= total_penalty <= max_penalty,
            "With {}+{} services, {} blocks, penalty {} but expected {}..{}"
            .format(initial_services, added_services, len(hashes),
                    total_penalty, min_penalty, max_penalty))

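# For context: weighted_service_roots() gives a rendezvous-hashing probe
# order, so adding servers should displace existing probes only slightly.
# The following standalone sketch illustrates the idea being tested; it is
# not the KeepClient implementation, and the scoring details (md5 of block
# hash concatenated with a service id) are assumptions.  Because adding a
# service leaves the relative order of the existing services unchanged,
# the old first choice can be pushed back by at most the number of
# services added -- the bound asserted above with
# assertLessEqual(penalty, added_services).

def _sketch_probe_order(block_hash, service_ids):
    # Rendezvous (highest-random-weight) hashing: score each service by
    # hashing the block hash together with the service id, then probe
    # services in descending score order.
    return sorted(
        service_ids,
        key=lambda sid: hashlib.md5((block_hash + sid).encode()).hexdigest(),
        reverse=True)

def _sketch_penalty_bound():
    block = hashlib.md5(b'example').hexdigest()
    before = _sketch_probe_order(block, ['svc-%02d' % i for i in range(12)])
    after = _sketch_probe_order(block, ['svc-%02d' % i for i in range(13)])
    # The old first choice is still first, or was displaced only by the
    # single service that was added.
    assert after.index(before[0]) <= 1
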
def test_probe_order_reference_set(self):
    # expected_order[i] is the probe order for
    # hash=md5(sprintf("%064x",i)) where there are 16 services
    # with uuid sprintf("anything-%015x",j) with j in 0..15. E.g.,
    # the first probe for the block consisting of 64 "0"
    # characters is the service whose uuid is
    # "zzzzz-bi6l4-000000000000003", so expected_order[0][0]=='3'.
    expected_order = [
        list('3eab2d5fc9681074'),
        list('097dba52e648f1c3'),
        list('c5b4e023f8a7d691'),
        list('9d81c02e76a3bf54'),
    ]
    hashes = [
        hashlib.md5("{:064x}".format(x).encode()).hexdigest()
        for x in range(len(expected_order))]
    api_client = self.mock_keep_services(count=16)
    keep_client = arvados.KeepClient(api_client=api_client)
    for i, hash in enumerate(hashes):
        roots = keep_client.weighted_service_roots(arvados.KeepLocator(hash))
        got_order = [
            re.search(r'//\[?keep0x([0-9a-f]+)', root).group(1)
            for root in roots]
        self.assertEqual(expected_order[i], got_order)

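# If the mock service set ever changes, the reference table above can be
# regenerated with a helper along these lines (a hypothetical utility,
# not part of the test suite; it reuses the same mock setup and regex as
# the test):

def _regenerate_reference_order(self, n_blocks=4):
    api_client = self.mock_keep_services(count=16)
    keep_client = arvados.KeepClient(api_client=api_client)
    for x in range(n_blocks):
        hash = hashlib.md5("{:064x}".format(x).encode()).hexdigest()
        roots = keep_client.weighted_service_roots(arvados.KeepLocator(hash))
        # Print one row in the same form as the expected_order entries.
        print(''.join(
            re.search(r'//\[?keep0x([0-9a-f]+)', root).group(1)
            for root in roots))
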
def test_weighted_service_roots_against_reference_set(self):
    # Confirm weighted_service_roots() returns the correct order
    for i, hash in enumerate(self.hashes):
        roots = self.keep_client.weighted_service_roots(
            arvados.KeepLocator(hash))
        got_order = [
            re.search(r'//\[?keep0x([0-9a-f]+)', root).group(1)
            for root in roots]
        self.assertEqual(self.expected_order[i], got_order)

def total_collection_size(manifest_text):
    """Return the total number of bytes in this collection (excluding
    duplicate blocks)."""

    total_bytes = 0
    locators_seen = {}
    for line in manifest_text.splitlines():
        words = line.split()
        for word in words[1:]:
            try:
                loc = arvados.KeepLocator(word)
            except ValueError:
                continue  # this word isn't a locator, skip it
            if loc.md5sum not in locators_seen:
                locators_seen[loc.md5sum] = True
                total_bytes += loc.size
    return total_bytes

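# Illustrative check (not part of the tool): a manifest in which the
# block for "foo" appears twice.  The duplicate locator is counted only
# once, so the total is 3 + 5 = 8 bytes.  (The hashes below are
# md5("foo") and md5("bar"); the "pos:size:filename" tokens are skipped
# by the ValueError handler above.)

def _example_total_collection_size():
    manifest = (". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo.txt\n"
                ". acbd18db4cc2f85cedef654fccc4a4d8+3 "
                "37b51d194a7513e45b56f6524f2d51f2+5 0:8:bar.txt\n")
    assert total_collection_size(manifest) == 8
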
def copy_collection(obj_uuid, src, dst, args):
    if arvados.util.keep_locator_pattern.match(obj_uuid):
        # If the obj_uuid is a portable data hash, it might not be
        # uniquely identified with a particular collection.  As a
        # result, it is ambiguous which name to use for the copy.
        # Apply some heuristics to pick which collection to get the
        # name from.
        srccol = src.collections().list(
            filters=[['portable_data_hash', '=', obj_uuid]],
            order="created_at asc"
        ).execute(num_retries=args.retries)

        items = srccol.get("items")

        if not items:
            logger.warning("Could not find collection with portable data hash %s", obj_uuid)
            return

        c = None

        if len(items) == 1:
            # There's only one collection with the PDH, so use that.
            c = items[0]
        if not c:
            # See if there is a collection that's in the same project
            # as the root item (usually a workflow) being copied.
            # (src_owner_uuid is defined elsewhere in this module.)
            for i in items:
                if i.get("owner_uuid") == src_owner_uuid and i.get("name"):
                    c = i
                    break
        if not c:
            # Didn't find any collections located in the same project, so
            # pick the oldest collection that has a name assigned to it.
            for i in items:
                if i.get("name"):
                    c = i
                    break
        if not c:
            # None of the collections have names (?!), so just pick the
            # first one.
            c = items[0]

        # list() doesn't return manifest text (and we don't want it to,
        # because we don't need the same manifest text sent to us 50
        # times) so go and retrieve the collection object directly
        # which will include the manifest text.
        c = src.collections().get(uuid=c["uuid"]).execute(num_retries=args.retries)
    else:
        # Assume this is an actual collection uuid, so fetch it directly.
        c = src.collections().get(uuid=obj_uuid).execute(num_retries=args.retries)

    # If a collection with this hash already exists at the
    # destination, and 'force' is not true, just return that
    # collection.
    if not args.force:
        if 'portable_data_hash' in c:
            colhash = c['portable_data_hash']
        else:
            colhash = c['uuid']
        dstcol = dst.collections().list(
            filters=[['portable_data_hash', '=', colhash]]
        ).execute(num_retries=args.retries)
        if dstcol['items_available'] > 0:
            for d in dstcol['items']:
                if ((args.project_uuid == d['owner_uuid']) and
                    (c.get('name') == d['name']) and
                    (c['portable_data_hash'] == d['portable_data_hash'])):
                    return d
            c['manifest_text'] = dst.collections().get(
                uuid=dstcol['items'][0]['uuid']
            ).execute(num_retries=args.retries)['manifest_text']
            return create_collection_from(c, src, dst, args)

    # Fetch the collection's manifest.
    manifest = c['manifest_text']
    logger.debug("Copying collection %s with manifest: <%s>", obj_uuid, manifest)

    # Copy each block from src_keep to dst_keep.
    # Use the newly signed locators returned from dst_keep to build
    # a new manifest as we go.
    src_keep = arvados.keep.KeepClient(api_client=src, num_retries=args.retries)
    dst_keep = arvados.keep.KeepClient(api_client=dst, num_retries=args.retries)
    dst_manifest = io.StringIO()
    dst_locators = {}
    bytes_written = 0
    bytes_expected = total_collection_size(manifest)
    if args.progress:
        progress_writer = ProgressWriter(human_progress)
    else:
        progress_writer = None

    for line in manifest.splitlines():
        words = line.split()
        dst_manifest.write(words[0])
        for word in words[1:]:
            try:
                loc = arvados.KeepLocator(word)
            except ValueError:
                # If 'word' can't be parsed as a locator,
                # presume it's a filename.
                dst_manifest.write(' ')
                dst_manifest.write(word)
                continue
            blockhash = loc.md5sum
            # copy this block if we haven't seen it before
            # (otherwise, just reuse the existing dst_locator)
            if blockhash not in dst_locators:
                logger.debug("Copying block %s (%s bytes)", blockhash, loc.size)
                if progress_writer:
                    progress_writer.report(obj_uuid, bytes_written, bytes_expected)
                data = src_keep.get(word)
                dst_locator = dst_keep.put(data)
                dst_locators[blockhash] = dst_locator
                bytes_written += loc.size
            dst_manifest.write(' ')
            dst_manifest.write(dst_locators[blockhash])
        dst_manifest.write("\n")

    if progress_writer:
        progress_writer.report(obj_uuid, bytes_written, bytes_expected)
        progress_writer.finish()

    # Copy the manifest and save the collection.
    logger.debug('saving %s with manifest: <%s>', obj_uuid, dst_manifest.getvalue())

    c['manifest_text'] = dst_manifest.getvalue()
    return create_collection_from(c, src, dst, args)

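# Hypothetical usage sketch (not part of arv-copy): 'src' and 'dst' would
# be arvados.api() clients for the source and destination clusters, and
# 'args' is normally the argparse namespace built in main(); the uuids
# below are placeholders.
import argparse

def _example_copy_collection(src, dst):
    args = argparse.Namespace(
        retries=3, force=False, progress=True,
        project_uuid='zzzzz-j7d0g-000000000000000')
    return copy_collection('zzzzz-4zz18-000000000000000', src, dst, args)
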
def get_service_roots(self, api_client):
    keep_client = arvados.KeepClient(api_client=api_client)
    services = keep_client.weighted_service_roots(
        arvados.KeepLocator('0' * 32))
    return [urllib.parse.urlparse(url) for url in sorted(services)]