sc.addFile("/nfs/paper-big-data-engines/utils.py") sc.addFile("/nfs/paper-big-data-engines/histogram/Histogram.py") from utils import benchmark, crawl_dir, read_img from Histogram import ( calculate_histogram, combine_histogram, flatten, save_histogram, ) print("Connected") # Read images paths = crawl_dir(os.path.abspath(args.bb_dir)) paths = sc.parallelize(paths, len(paths)) img_rdd = paths.map(lambda p: read_img(p, start=start, args=args)) img_rdd = img_rdd.map( lambda x: flatten(x[1], start=start, args=args, filename=x[0])) partial_histogram = img_rdd.map(lambda x: calculate_histogram( x[1], args=args, start=start, filename=x[0])) histogram = partial_histogram.fold( np.array([0] * (2**16 - 1)), lambda x, y: combine_histogram(x, y, args=args, start=start), ) save_histogram(histogram, args=args, start=start)
# Dask version: connect to the scheduler, then run the same map/fold
# pipeline on a Dask bag.
# `args` and `start` are assumed to be defined earlier in the script.
import os

import dask.bag as db
from dask.distributed import Client

cluster = args.scheduler
client = Client(cluster)
print(client)

# Allow workers to use the helper modules.
client.upload_file("/nfs/paper-big-data-engines/utils.py")
client.upload_file("/nfs/paper-big-data-engines/histogram/Histogram.py")
from utils import benchmark, crawl_dir, read_img
from Histogram import (
    calculate_histogram,
    combine_histogram,
    flatten,
    save_histogram,
)

# Read images, one partition per file.
paths = crawl_dir(os.path.abspath(args.bb_dir))
paths = db.from_sequence(paths, npartitions=len(paths))
img = paths.map(lambda p: read_img(p, start=start, args=args))

# Flatten each image's data to 1-D.
img = img.map(
    lambda x: flatten(x[1], start=start, args=args, filename=x[0])
)

# Compute a partial histogram per image, then fold them into one.
partial_histogram = img.map(
    lambda x: calculate_histogram(x[1], args=args, start=start, filename=x[0])
)

histogram = partial_histogram.fold(
    lambda x, y: combine_histogram(x, y, args=args, start=start)
).compute()
save_histogram(histogram, args=args, start=start)
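Both versions call the same helpers from Histogram.py, which are not shown above. The sketch below is a hypothetical reconstruction, not the authors' code: it assumes 16-bit image intensities, so that bins=range(2 ** 16) yields the 2 ** 16 - 1 bins matching the Spark fold's initial zero vector, and it ignores the start and args parameters, which the original presumably uses for benchmarking and logging.

# Hypothetical sketch of the Histogram.py helpers; not the authors' code.
import numpy as np


def flatten(img, start=None, args=None, filename=None):
    # Flatten the image data to 1-D, keeping the filename for later stages.
    return filename, img.flatten()


def calculate_histogram(data, args=None, start=None, filename=None):
    # Bin 16-bit intensities: range(2 ** 16) gives 2 ** 16 edges,
    # hence 2 ** 16 - 1 bins, matching the Spark fold's zero vector.
    histogram, _ = np.histogram(data, bins=range(2 ** 16))
    return histogram


def combine_histogram(x, y, args=None, start=None):
    # Element-wise sum of two partial histograms.
    return x + y


def save_histogram(histogram, args=None, start=None):
    # Persist the final histogram; the real destination is unknown.
    np.save("histogram.npy", histogram)

Under these assumptions the reduction is an element-wise vector sum, which is associative and commutative, as both Spark's fold and Dask's bag fold require.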