 def test_probe_waste_adding_one_server(self):
     hashes = [
         hashlib.md5("{:064x}".format(x).encode()).hexdigest() for x in range(100)
     ]
     initial_services = 12
     self.api_client = self.mock_keep_services(count=initial_services)
     self.keep_client = arvados.KeepClient(api_client=self.api_client)
     probes_before = [
         self.keep_client.weighted_service_roots(arvados.KeepLocator(hash))
         for hash in hashes
     ]
     for added_services in range(1, 12):
         api_client = self.mock_keep_services(count=initial_services +
                                              added_services)
         keep_client = arvados.KeepClient(api_client=api_client)
         total_penalty = 0
         for hash_index in range(len(hashes)):
             probe_after = keep_client.weighted_service_roots(
                 arvados.KeepLocator(hashes[hash_index]))
             penalty = probe_after.index(probes_before[hash_index][0])
             self.assertLessEqual(penalty, added_services)
             total_penalty += penalty
         # Average penalty per block should not exceed
         # N(added)/N(orig) by more than 20%, and should get closer
         # to the ideal as we add data points.
         expect_penalty = (added_services * len(hashes) / initial_services)
         max_penalty = (expect_penalty * (120 - added_services) / 100)
         min_penalty = (expect_penalty * 8 / 10)
         self.assertTrue(
             min_penalty <= total_penalty <= max_penalty,
             "With {}+{} services, {} blocks, penalty {} but expected {}..{}"
             .format(initial_services, added_services, len(hashes),
                     total_penalty, min_penalty, max_penalty))
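For context, the arithmetic the assertions above encode works out as follows on the first iteration (a worked example only, not additional test logic):

# With 12 initial services, 100 blocks, and added_services == 1:
#   expect_penalty = 1 * 100 / 12              ~= 8.33
#   max_penalty    = expect_penalty * 119/100  ~= 9.92
#   min_penalty    = expect_penalty * 8/10     ~= 6.67
# i.e. the total probe-order displacement across the 100 blocks must stay
# within roughly 6.67..9.92 when a single service is added.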
Example #2
 def test_probe_order_reference_set(self):
     # expected_order[i] is the probe order for
     # hash=md5(sprintf("%064x",i)) where there are 16 services
     # with uuid sprintf("anything-%015x",j) with j in 0..15. E.g.,
     # the first probe for the block consisting of 64 "0"
     # characters is the service whose uuid is
     # "zzzzz-bi6l4-000000000000003", so expected_order[0][0]=='3'.
     expected_order = [
         list('3eab2d5fc9681074'),
         list('097dba52e648f1c3'),
         list('c5b4e023f8a7d691'),
         list('9d81c02e76a3bf54'),
     ]
     hashes = [
         hashlib.md5("{:064x}".format(x).encode()).hexdigest()
         for x in range(len(expected_order))
     ]
     api_client = self.mock_keep_services(count=16)
     keep_client = arvados.KeepClient(api_client=api_client)
     for i, hash in enumerate(hashes):
         roots = keep_client.weighted_service_roots(
             arvados.KeepLocator(hash))
         got_order = [
             re.search(r'//\[?keep0x([0-9a-f]+)', root).group(1)
             for root in roots
         ]
         self.assertEqual(expected_order[i], got_order)

 def test_weighted_service_roots_against_reference_set(self):
     # Confirm weighted_service_roots() returns the correct order
     for i, hash in enumerate(self.hashes):
         roots = self.keep_client.weighted_service_roots(
             arvados.KeepLocator(hash))
         got_order = [
             re.search(r'//\[?keep0x([0-9a-f]+)', root).group(1)
             for root in roots
         ]
         self.assertEqual(self.expected_order[i], got_order)
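Both probe-order tests above rely on KeepClient's rendezvous (highest-random-weight) hashing, which is what keeps the probe order stable as services are added. A minimal sketch of the idea follows; the weight function shown (md5 of the block hash concatenated with the tail of the service UUID) is an assumption for illustration, not taken verbatim from the KeepClient implementation.

import hashlib

def sketch_probe_order(block_hash, service_uuids):
    # Rendezvous-hashing sketch: derive a per-service weight from the block
    # hash plus part of the service UUID, then probe services in descending
    # weight order.  The md5(block_hash + uuid[-15:]) weight used here is an
    # assumed stand-in for the real KeepClient weighting.
    def weight(uuid):
        return hashlib.md5((block_hash + uuid[-15:]).encode()).hexdigest()
    return sorted(service_uuids, key=weight, reverse=True)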
Example #4
def total_collection_size(manifest_text):
    """Return the total number of bytes in this collection (excluding
    duplicate blocks)."""

    total_bytes = 0
    locators_seen = {}
    for line in manifest_text.splitlines():
        words = line.split()
        for word in words[1:]:
            try:
                loc = arvados.KeepLocator(word)
            except ValueError:
                continue  # this word isn't a locator, skip it
            if loc.md5sum not in locators_seen:
                locators_seen[loc.md5sum] = True
                total_bytes += loc.size

    return total_bytes
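As a quick check of the function above, consider a minimal one-stream manifest with a single 3-byte block (the locator hash is md5 of "foo"); the file token 0:3:foo.txt is skipped because it does not parse as a locator, and repeating the same locator elsewhere would not increase the total:

# Hypothetical usage:
manifest = ". acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo.txt\n"
print(total_collection_size(manifest))  # expected: 3 (each block counted once)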
Example #5
def copy_collection(obj_uuid, src, dst, args):
    if arvados.util.keep_locator_pattern.match(obj_uuid):
        # If the obj_uuid is a portable data hash, it might not be
        # uniquely identified with a particular collection.  As a
        # result, it is ambiguous as to what name to use for the copy.
        # Apply some heuristics to pick which collection to get the
        # name from.
        srccol = src.collections().list(
            filters=[['portable_data_hash', '=', obj_uuid]],
            order="created_at asc"
            ).execute(num_retries=args.retries)

        items = srccol.get("items")

        if not items:
            logger.warning("Could not find collection with portable data hash %s", obj_uuid)
            return

        c = None

        if len(items) == 1:
            # There's only one collection with the PDH, so use that.
            c = items[0]
        if not c:
            # See if there is a collection that's in the same project
            # as the root item (usually a workflow) being copied.
            for i in items:
                if i.get("owner_uuid") == src_owner_uuid and i.get("name"):
                    c = i
                    break
        if not c:
            # Didn't find any collections located in the same project, so
            # pick the oldest collection that has a name assigned to it.
            for i in items:
                if i.get("name"):
                    c = i
                    break
        if not c:
            # None of the collections have names (?!), so just pick the
            # first one.
            c = items[0]

        # list() doesn't return manifest text (and we don't want it to,
        # because we don't need the same manifest text sent to us 50
        # times) so go and retrieve the collection object directly
        # which will include the manifest text.
        c = src.collections().get(uuid=c["uuid"]).execute(num_retries=args.retries)
    else:
        # Assume this is an actual collection uuid, so fetch it directly.
        c = src.collections().get(uuid=obj_uuid).execute(num_retries=args.retries)

    # If a collection with this hash already exists at the
    # destination, and 'force' is not true, just return that
    # collection.
    if not args.force:
        if 'portable_data_hash' in c:
            colhash = c['portable_data_hash']
        else:
            colhash = c['uuid']
        dstcol = dst.collections().list(
            filters=[['portable_data_hash', '=', colhash]]
        ).execute(num_retries=args.retries)
        if dstcol['items_available'] > 0:
            for d in dstcol['items']:
                if ((args.project_uuid == d['owner_uuid']) and
                    (c.get('name') == d['name']) and
                    (c['portable_data_hash'] == d['portable_data_hash'])):
                    return d
            c['manifest_text'] = dst.collections().get(
                uuid=dstcol['items'][0]['uuid']
            ).execute(num_retries=args.retries)['manifest_text']
            return create_collection_from(c, src, dst, args)

    # Fetch the collection's manifest.
    manifest = c['manifest_text']
    logger.debug("Copying collection %s with manifest: <%s>", obj_uuid, manifest)

    # Copy each block from src_keep to dst_keep.
    # Use the newly signed locators returned from dst_keep to build
    # a new manifest as we go.
    src_keep = arvados.keep.KeepClient(api_client=src, num_retries=args.retries)
    dst_keep = arvados.keep.KeepClient(api_client=dst, num_retries=args.retries)
    dst_manifest = io.StringIO()
    dst_locators = {}
    bytes_written = 0
    bytes_expected = total_collection_size(manifest)
    if args.progress:
        progress_writer = ProgressWriter(human_progress)
    else:
        progress_writer = None

    for line in manifest.splitlines():
        words = line.split()
        dst_manifest.write(words[0])
        for word in words[1:]:
            try:
                loc = arvados.KeepLocator(word)
            except ValueError:
                # If 'word' can't be parsed as a locator,
                # presume it's a filename.
                dst_manifest.write(' ')
                dst_manifest.write(word)
                continue
            blockhash = loc.md5sum
            # copy this block if we haven't seen it before
            # (otherwise, just reuse the existing dst_locator)
            if blockhash not in dst_locators:
                logger.debug("Copying block %s (%s bytes)", blockhash, loc.size)
                if progress_writer:
                    progress_writer.report(obj_uuid, bytes_written, bytes_expected)
                data = src_keep.get(word)
                dst_locator = dst_keep.put(data)
                dst_locators[blockhash] = dst_locator
                bytes_written += loc.size
            dst_manifest.write(' ')
            dst_manifest.write(dst_locators[blockhash])
        dst_manifest.write("\n")

    if progress_writer:
        progress_writer.report(obj_uuid, bytes_written, bytes_expected)
        progress_writer.finish()

    # Copy the manifest and save the collection.
    logger.debug('saving %s with manifest: <%s>', obj_uuid, dst_manifest.getvalue())

    c['manifest_text'] = dst_manifest.getvalue()
    return create_collection_from(c, src, dst, args)
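To make the rewrite concrete: each locator token in the source manifest is replaced by the signed locator returned by dst_keep.put(), while the stream name and file tokens are copied through unchanged. A hypothetical before/after for one line (the +A permission-hint suffix is a placeholder, not a real signature):

# source line:
#   . acbd18db4cc2f85cedef654fccc4a4d8+3 0:3:foo.txt
# rewritten line:
#   . acbd18db4cc2f85cedef654fccc4a4d8+3+A<signature>@<expiry> 0:3:foo.txt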
 def get_service_roots(self, api_client):
     keep_client = arvados.KeepClient(api_client=api_client)
     services = keep_client.weighted_service_roots(
         arvados.KeepLocator('0' * 32))
     return [urlparse.urlparse(url) for url in sorted(services)]