Beispiel #1
0
def main():

    if len(sys.argv) == 1 or len(sys.argv) > 1 and (sys.argv[1] == "-h" or
                                                    sys.argv[1] == "--help"):
        printUsage()

    rootid = sys.argv[1]

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to setup a asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session(loop=loop)
    app["session"] = session
    loop.run_until_complete(run_delete(app, rootid))

    loop.close()

    print("done!")
Beispiel #2
0
def main():

    do_update = False

    if len(sys.argv) < 4:
        printUsage()

    rootid = sys.argv[1]
    prefix_old = sys.argv[2]
    prefix_new = sys.argv[3]
    if len(sys.argv) > 4 and sys.argv[4] == "-update":
        do_update = True

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    if prefix_old == prefix_new:
        print("prefix_old and prefix_new or the same")
        sys.exit(1)

    # we need to setup a asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["prefix_old"] = prefix_old
    app["prefix_new"] = prefix_new
    app["do_update"] = do_update
    app["dataset_count"] = 0
    app["matched_dset_uri"] = 0
    app["indirect_dataset_keys"] = []
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size,
                                 name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    print("datsets scanned:", app["dataset_count"])
    print(
        "datasets with matching uri ('H5D_CONTIGUOUS_REF', 'H5D_CHUNKED_REF' layouts):",
        app["matched_dset_uri"])

    print("done!")
Beispiel #3
0
def main():

    if len(sys.argv) == 1 or len(sys.argv) > 1 and (sys.argv[1] == "-h" or sys.argv[1] == "--help"):
        printUsage()


    rootid = sys.argv[1]

    if len(sys.argv) > 2 and sys.argv[2] == "-update":
        do_update = True
    else:
        do_update = False

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)


    # we need to setup a asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session(loop=loop)
    app["session"] = session
    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    total_size  = results["metadata_bytes"] + results["allocated_bytes"]
    print(f"lastModified: {lastModified}")
    print(f"size: {total_size}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    for dsetid in datasets:
        dataset_info = datasets[dsetid]
        print(f"   {dsetid}: {dataset_info['lastModified']}, {dataset_info['num_chunks']}, {dataset_info['allocated_bytes']}")

    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start: {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")



    print("done!")
Beispiel #4
0
    def testIsValidUuid(self):
        group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e"
        dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e"
        ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005"
        chunk_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2"
        domain_id = "mybucket/bob/mydata.h5"
        valid_ids = (group_id, dataset_id, ctype_id, chunk_id, domain_id)
        bad_ids = ("g-1e76d862", "/bob/mydata.h5")

        self.assertTrue(isValidUuid(group_id))
        self.assertFalse(isSchema2Id(group_id))
        self.assertTrue(isValidUuid(group_id, obj_class="Group"))
        self.assertTrue(isValidUuid(group_id, obj_class="group"))
        self.assertTrue(isValidUuid(group_id, obj_class="groups"))
        self.assertTrue(isValidUuid(dataset_id, obj_class="datasets"))
        self.assertFalse(isSchema2Id(dataset_id))
        self.assertTrue(isValidUuid(ctype_id, obj_class="datatypes"))
        self.assertFalse(isSchema2Id(ctype_id))
        self.assertTrue(isValidUuid(chunk_id, obj_class="chunks"))
        self.assertFalse(isSchema2Id(chunk_id))
        validateUuid(group_id)
        try:
            isRootObjId(group_id)
            self.assertTrue(False)
        except ValueError:
            # only works for v2 schema
            pass  # expected

        for item in valid_ids:
            self.assertTrue(isObjId(item))
            s3key = getS3Key(item)
            self.assertTrue(s3key[0] != '/')
            self.assertTrue(isS3ObjKey(s3key))
            if item.find('/') > 0:
                continue  # bucket name gets lost when domain ids get converted to s3keys
            objid = getObjId(s3key)
            self.assertEqual(objid, item)
        for item in bad_ids:
            self.assertFalse(isValidUuid(item))
            self.assertFalse(isObjId(item))
Beispiel #5
0
    def testSchema2Id(self):
        root_id = createObjId("roots")
        group_id = createObjId("groups", rootid=root_id)
        dataset_id = createObjId("datasets", rootid=root_id)
        ctype_id = createObjId("datatypes", rootid=root_id)

        self.assertEqual(getCollectionForId(root_id), "groups")
        self.assertEqual(getCollectionForId(group_id), "groups")
        self.assertEqual(getCollectionForId(dataset_id), "datasets")
        self.assertEqual(getCollectionForId(ctype_id), "datatypes")
        chunk_id = 'c' + dataset_id[1:] + "_1_2"
        print(chunk_id)
        chunk_partition_id = 'c42-' + dataset_id[2:] + "_1_2"

        for id in (chunk_id, chunk_partition_id):
            try:
                getCollectionForId(id)
                self.assertTrue(False)
            except ValueError:
                pass  # expected
        valid_ids = (group_id, dataset_id, ctype_id, chunk_id,
                     chunk_partition_id, root_id)
        s3prefix = getS3Key(root_id)
        self.assertTrue(s3prefix.endswith("/.group.json"))
        s3prefix = s3prefix[:-(len(".group.json"))]
        for oid in valid_ids:
            print("oid:", oid)
            self.assertTrue(len(oid) >= 38)
            parts = oid.split('-')
            self.assertEqual(len(parts), 6)
            self.assertTrue(oid[0] in ('g', 'd', 't', 'c'))
            self.assertTrue(isSchema2Id(oid))
            if oid == root_id:
                self.assertTrue(isRootObjId(oid))
            else:
                self.assertFalse(isRootObjId(oid))
            self.assertEqual(getRootObjId(oid), root_id)

            s3key = getS3Key(oid)
            print(s3key)
            self.assertTrue(s3key.startswith(s3prefix))
            self.assertEqual(getObjId(s3key), oid)
            self.assertTrue(isS3ObjKey(s3key))
Beispiel #6
0
def main():

    if len(sys.argv) == 1 or len(sys.argv) > 1 and (sys.argv[1] == "-h" or
                                                    sys.argv[1] == "--help"):
        printUsage()

    rootid = sys.argv[1]

    if len(sys.argv) > 2 and sys.argv[2] == "-update":
        do_update = True
    else:
        do_update = False

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to setup a asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size,
                                 name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    print(f"lastModified: {lastModified}")
    if "md5_sum" in results:
        checksum = results["md5_sum"]
        print(f"md5_sum: {checksum}")
    print(f"metadata bytes: {results['metadata_bytes']}")
    print(f"allocated bytes: {results['allocated_bytes']}")
    print(f"logical bytes: {results['logical_bytes']}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"linked chunks: {results['num_linked_chunks']}")
    print(f"linked bytes: {results['linked_bytes']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    if datasets:
        print(
            "    dataset_id\tlast_modified\tnum_chunks\tallocated_bytes\tlogical_bytes\tlinked_bytes\tnum_link_chunks"
        )
    for dsetid in datasets:
        dataset_info = datasets[dsetid]
        lm = dataset_info['lastModified']
        nc = dataset_info['num_chunks']
        ab = dataset_info['allocated_bytes']
        lb = dataset_info['logical_bytes']
        ln = dataset_info['linked_bytes']
        nl = dataset_info['num_linked_chunks']
        print(f"   {dsetid}: {lm}, {nc}, {ab}, {lb}, {ln}, {nl}")

    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start:    {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")

    print("done!")