Code example #1 (Spark)
    import os

    import numpy as np

    # `sc` (the SparkContext), `args` (the parsed command-line options) and
    # `start` (the benchmark start time) are created earlier in the driver
    # script; a sketch of that setup follows this example.
    # Ship the helper modules to the worker nodes.
    sc.addFile("/nfs/paper-big-data-engines/utils.py")
    sc.addFile("/nfs/paper-big-data-engines/histogram/Histogram.py")
    from utils import benchmark, crawl_dir, read_img
    from Histogram import (
        calculate_histogram,
        combine_histogram,
        flatten,
        save_histogram,
    )

    print("Connected")

    # Read the images, one partition per input file
    paths = crawl_dir(os.path.abspath(args.bb_dir))
    paths = sc.parallelize(paths, len(paths))
    img_rdd = paths.map(lambda p: read_img(p, start=start, args=args))

    # Flatten each image into a 1-D array of intensity values
    img_rdd = img_rdd.map(
        lambda x: flatten(x[1], start=start, args=args, filename=x[0])
    )

    # Compute a partial histogram for each flattened image
    partial_histogram = img_rdd.map(
        lambda x: calculate_histogram(x[1], args=args, start=start, filename=x[0])
    )

    # Fold the partial histograms into a single histogram,
    # starting from a zero-filled accumulator
    histogram = partial_histogram.fold(
        np.array([0] * (2**16 - 1)),
        lambda x, y: combine_histogram(x, y, args=args, start=start),
    )

    # Save the aggregated histogram
    save_histogram(histogram, args=args, start=start)
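
The snippet above uses `sc`, `args`, and `start` without defining them; they come from the surrounding driver script. A minimal sketch of that setup, assuming an argparse interface with a single `bb_dir` option and a SparkContext built from the default configuration (the application name and help texts are assumptions, not taken from the original script):

    import argparse
    from time import time

    from pyspark import SparkConf, SparkContext

    # Hypothetical argument parsing; only `bb_dir` appears in the snippet above.
    parser = argparse.ArgumentParser(description="Histogram of image chunks")
    parser.add_argument("bb_dir", type=str, help="directory containing the images")
    args = parser.parse_args()

    start = time()  # reference time used by the benchmarking helpers

    conf = SparkConf().setAppName("histogram")
    sc = SparkContext.getOrCreate(conf=conf)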
Code example #2 (Dask)
    import os

    import dask.bag as db
    from dask.distributed import Client

    # `args` (the parsed command-line options) and `start` (the benchmark
    # start time) are created earlier in the driver script.
    # Connect to the Dask scheduler
    cluster = args.scheduler
    client = Client(cluster)

    print(client)
    # Allow workers to use the helper modules
    client.upload_file("/nfs/paper-big-data-engines/utils.py")
    client.upload_file("/nfs/paper-big-data-engines/histogram/Histogram.py")
    from utils import benchmark, crawl_dir, read_img
    from Histogram import (
        calculate_histogram,
        combine_histogram,
        flatten,
        save_histogram,
    )

    # Read the images, one partition per input file
    paths = crawl_dir(os.path.abspath(args.bb_dir))
    paths = db.from_sequence(paths, npartitions=len(paths))
    img = paths.map(lambda p: read_img(p, start=start, args=args))

    # Flatten each image into a 1-D array of intensity values
    img = img.map(
        lambda x: flatten(x[1], start=start, args=args, filename=x[0])
    )

    # Compute a partial histogram for each flattened image
    partial_histogram = img.map(
        lambda x: calculate_histogram(x[1], args=args, start=start, filename=x[0])
    )

    # Fold the partial histograms into a single histogram and
    # trigger the lazy computation
    histogram = partial_histogram.fold(
        lambda x, y: combine_histogram(x, y, args=args, start=start)
    ).compute()

    # Save the aggregated histogram
    save_histogram(histogram, args=args, start=start)
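
Both versions import the same helpers from utils.py and Histogram.py, whose bodies are not shown. The sketch below only illustrates what each step computes, inferred from the call sites above: the 16-bit intensity range, the (filename, data) tuple layout, and the output path are assumptions, and the real helpers presumably also record per-task timings through the `benchmark` utility (hence the `start` and `args` parameters, kept here but unused).

    import numpy as np

    def flatten(img, start=None, args=None, filename=None):
        # Turn one image into a (filename, 1-D intensity array) pair.
        return filename, np.asarray(img).ravel()

    def calculate_histogram(data, args=None, start=None, filename=None):
        # Per-image histogram; the 2**16 - 1 bins match the fold seed
        # used in the Spark version.
        counts, _ = np.histogram(data, bins=2**16 - 1, range=(0, 2**16 - 1))
        return counts

    def combine_histogram(x, y, args=None, start=None):
        # Element-wise sum of two partial histograms.
        return x + y

    def save_histogram(histogram, args=None, start=None):
        # Persist the aggregated histogram; the output path is a placeholder.
        np.save("histogram.npy", histogram)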