Example 1
def main():
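    """Command-line entry point: delete the storage objects under the given
    root id via run_delete."""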

    if len(sys.argv) == 1 or len(sys.argv) > 1 and (sys.argv[1] == "-h" or
                                                    sys.argv[1] == "--help"):
        printUsage()

    rootid = sys.argv[1]

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session(loop=loop)
    app["session"] = session
    loop.run_until_complete(run_delete(app, rootid))

    loop.close()

    print("done!")
Example 2
def main():
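    """Command-line entry point: scan the datasets under a root id for layout
    URIs that match prefix_old and, if -update is given, apply prefix_new."""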

    do_update = False

    if len(sys.argv) < 4:
        printUsage()

    rootid = sys.argv[1]
    prefix_old = sys.argv[2]
    prefix_new = sys.argv[3]
    if len(sys.argv) > 4 and sys.argv[4] == "-update":
        do_update = True

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    if prefix_old == prefix_new:
        print("prefix_old and prefix_new or the same")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["prefix_old"] = prefix_old
    app["prefix_new"] = prefix_new
    app["do_update"] = do_update
    app["dataset_count"] = 0
    app["matched_dset_uri"] = 0
    app["indirect_dataset_keys"] = []
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size,
                                 name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    print("datsets scanned:", app["dataset_count"])
    print(
        "datasets with matching uri ('H5D_CONTIGUOUS_REF', 'H5D_CHUNKED_REF' layouts):",
        app["matched_dset_uri"])

    print("done!")
Example 3
def main():
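    """Command-line entry point: scan the given root id via run_scan and
    print summary statistics for the scan."""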

    if len(sys.argv) == 1 or len(sys.argv) > 1 and (sys.argv[1] == "-h" or sys.argv[1] == "--help"):
        printUsage()

    rootid = sys.argv[1]

    if len(sys.argv) > 2 and sys.argv[2] == "-update":
        do_update = True
    else:
        do_update = False

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session(loop=loop)
    app["session"] = session
    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    total_size = results["metadata_bytes"] + results["allocated_bytes"]
    print(f"lastModified: {lastModified}")
    print(f"size: {total_size}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    for dsetid in datasets:
        dataset_info = datasets[dsetid]
        print(f"   {dsetid}: {dataset_info['lastModified']}, {dataset_info['num_chunks']}, {dataset_info['allocated_bytes']}")

    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start: {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")

    print("done!")
Example 4
async def bucketCheck(app):
    """ Verify that contents of bucket are self-consistent
    """

    now = int(time.time())
    log.info("bucket check {}".format(unixTimeToUTC(now)))

    # do initial listKeys
    await listKeys(app)

    # clear used flags
    clearUsedFlags(app)

    # mark objs
    await markObjs(app)

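    # any non-chunk object that markObjs did not flag as used is unlinked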
    unlinked_count = 0
    s3objs = app["s3objs"]
    for objid in s3objs:
        if isValidUuid(objid) and not isValidChunkId(objid):
            try:
                s3obj = await getS3Obj(app, objid)
                if s3obj.used is False:
                    unlinked_count += 1
            except HTTPInternalServerError as hpe:
                log.warn("got error retrieving {}: {}".format(objid, hpe.code))

    domains = app["domains"]
    for domain in domains:
        print("domain:", domain)
    roots = app["roots"]
    for root in roots:
        print("root:", root)

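    # a top-level domain starts with '/' and has no further '/' in its name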
    top_level_domains = []
    for domain in domains:
        if domain[0] != '/':
            log.error("unexpected domain: {}".format(domain))
            continue
        if domain[1:].find('/') == -1:
            top_level_domains.append(domain)

    print("top-level-domains:")
    for domain in top_level_domains:
        print(domain)
    print("=" * 80)

    print("total storage: {}".format(app["bytes_in_bucket"]))
    print("Num objects: {}".format(len(app["s3objs"])))
    print("Num domains: {}".format(len(app["domains"])))
    print("Num root groups: {}".format(len(app["roots"])))
    print("Unlinked objects: {}".format(unlinked_count))
Example 5
    def testIsValidUuid(self):
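        # sample ids of each object class; the isSchema2Id checks below show
        # these are all v1-style ids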
        group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e"
        dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e"
        ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005"
        chunk_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2"
        domain_id = "mybucket/bob/mydata.h5"
        valid_ids = (group_id, dataset_id, ctype_id, chunk_id, domain_id)
        bad_ids = ("g-1e76d862", "/bob/mydata.h5")

        self.assertTrue(isValidUuid(group_id))
        self.assertFalse(isSchema2Id(group_id))
        self.assertTrue(isValidUuid(group_id, obj_class="Group"))
        self.assertTrue(isValidUuid(group_id, obj_class="group"))
        self.assertTrue(isValidUuid(group_id, obj_class="groups"))
        self.assertTrue(isValidUuid(dataset_id, obj_class="datasets"))
        self.assertFalse(isSchema2Id(dataset_id))
        self.assertTrue(isValidUuid(ctype_id, obj_class="datatypes"))
        self.assertFalse(isSchema2Id(ctype_id))
        self.assertTrue(isValidUuid(chunk_id, obj_class="chunks"))
        self.assertFalse(isSchema2Id(chunk_id))
        validateUuid(group_id)
        try:
            isRootObjId(group_id)
            self.fail("expected ValueError")
        except ValueError:
            # only works for v2 schema
            pass  # expected

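        # each valid id should map to an s3 key and back again, except domain
        # ids, which lose their bucket name in the conversion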
        for item in valid_ids:
            self.assertTrue(isObjId(item))
            s3key = getS3Key(item)
            self.assertTrue(s3key[0] != '/')
            self.assertTrue(isS3ObjKey(s3key))
            if item.find('/') > 0:
                continue  # bucket name gets lost when domain ids get converted to s3keys
            objid = getObjId(s3key)
            self.assertEqual(objid, item)
        for item in bad_ids:
            self.assertFalse(isValidUuid(item))
            self.assertFalse(isObjId(item))
Example 6
def main():
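    """Command-line entry point: print storage info for the given object id
    via printS3Obj."""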
    if len(sys.argv) == 1 or sys.argv[1] == "-h" or sys.argv[1] == "--help":
        printUsage()
        sys.exit(1)

    obj_id = sys.argv[-1]
    if not isValidUuid(obj_id):
        print("Invalid obj id")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()
    session = get_session(loop=loop)

    app = {}
    app["session"] = session
    app['bucket_name'] = config.get("bucket_name")

    loop.run_until_complete(printS3Obj(app, obj_id))

    loop.close()
Example 7
def main():
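    """Command-line entry point: scan the given root id via run_scan (passing
    the -update flag if given) and print detailed storage statistics."""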

    if len(sys.argv) == 1 or len(sys.argv) > 1 and (sys.argv[1] == "-h" or
                                                    sys.argv[1] == "--help"):
        printUsage()

    rootid = sys.argv[1]

    if len(sys.argv) > 2 and sys.argv[2] == "-update":
        do_update = True
    else:
        do_update = False

    if not isValidUuid(rootid):
        print("Invalid root id!")
        sys.exit(1)

    if not isSchema2Id(rootid):
        print("This tool can only be used with Schema v2 ids")
        sys.exit(1)

    # we need to set up an asyncio loop to query s3
    loop = asyncio.get_event_loop()

    app = {}
    app["bucket_name"] = config.get("bucket_name")
    app["loop"] = loop
    session = get_session()
    app["session"] = session
    app["filter_map"] = {}

    # need the metadata cache since we will be calling into some SN methods
    metadata_mem_cache_size = int(config.get("metadata_mem_cache_size"))
    app['meta_cache'] = LruCache(mem_target=metadata_mem_cache_size,
                                 name="MetaCache")

    loop.run_until_complete(run_scan(app, rootid=rootid, update=do_update))

    loop.close()

    results = app["scanRoot_results"]
    datasets = results["datasets"]
    lastModified = datetime.fromtimestamp(results["lastModified"])
    print(f"lastModified: {lastModified}")
    if "md5_sum" in results:
        checksum = results["md5_sum"]
        print(f"md5_sum: {checksum}")
    print(f"metadata bytes: {results['metadata_bytes']}")
    print(f"allocated bytes: {results['allocated_bytes']}")
    print(f"logical bytes: {results['logical_bytes']}")
    print(f"num chunks: {results['num_chunks']}")
    print(f"linked chunks: {results['num_linked_chunks']}")
    print(f"linked bytes: {results['linked_bytes']}")
    print(f"num_groups: {results['num_groups']}")
    print(f"num_datatypes: {results['num_datatypes']}")
    print(f"num_datasets: {len(datasets)}")
    if datasets:
        print(
            "    dataset_id\tlast_modified\tnum_chunks\tallocated_bytes\tlogical_bytes\tlinked_bytes\tnum_link_chunks"
        )
    for dsetid in datasets:
        dataset_info = datasets[dsetid]
        lm = dataset_info['lastModified']
        nc = dataset_info['num_chunks']
        ab = dataset_info['allocated_bytes']
        lb = dataset_info['logical_bytes']
        ln = dataset_info['linked_bytes']
        nl = dataset_info['num_linked_chunks']
        print(f"   {dsetid}: {lm}, {nc}, {ab}, {lb}, {ln}, {nl}")

    scan_start = datetime.fromtimestamp(results["scan_start"])
    print(f"scan_start:    {scan_start}")
    scan_complete = datetime.fromtimestamp(results["scan_complete"])
    print(f"scan_complete: {scan_complete}")

    print("done!")