sc = SparkContext.getOrCreate(conf=conf)
sc.addFile("/nfs/paper-big-data-engines/utils.py")
sc.addFile("/nfs/paper-big-data-engines/histogram/Histogram.py")
from utils import benchmark, crawl_dir, read_img
from Histogram import (
    calculate_histogram,
    combine_histogram,
    flatten,
    save_histogram,
)

print("Connected")

# Read images
paths = crawl_dir(os.path.abspath(args.bb_dir))
paths = sc.parallelize(paths, len(paths))
img_rdd = paths.map(lambda p: read_img(p, start=start, args=args))

img_rdd = img_rdd.map(
    lambda x: flatten(x[1], start=start, args=args, filename=x[0])
)

partial_histogram = img_rdd.map(
    lambda x: calculate_histogram(x[1], args=args, start=start, filename=x[0])
)

histogram = partial_histogram.fold(
    np.array([0] * (2**16 - 1)),
    lambda x, y: combine_histogram(x, y, args=args, start=start),
)

save_histogram(histogram, args=args, start=start)
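The per-image histograms are computed and merged by helpers imported from Histogram.py, whose bodies are not shown in this listing. A minimal sketch of what calculate_histogram and combine_histogram could look like, assuming 16-bit voxel intensities stored as NumPy arrays (the function bodies and the integer cast are assumptions made here to match the call sites above, not the repository's code):

import numpy as np


def calculate_histogram(voxels, *, args=None, start=None, filename=None):
    # Assumed sketch: bin the flattened voxel intensities of one image.
    # args, start and filename are accepted only to mirror the call site above.
    return np.bincount(voxels.astype(np.int64), minlength=2**16 - 1)


def combine_histogram(x, y, *, args=None, start=None):
    # Assumed sketch: element-wise sum of two partial histograms, which is the
    # associative combine that fold() requires.
    return x + y

Since the zero array passed to fold() has length 2**16 - 1, any implementation along these lines must return arrays of that same length for the element-wise sum to line up.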
    action="store_true",
    help="benchmark results",
)
args = parser.parse_args()

# Cluster scheduler
cluster = args.scheduler
client = Client(cluster)
print(client)

client.upload_file("nfs/SOEN-499-Project/utils.py")  # Allow workers to use module
client.upload_file("nfs/SOEN-499-Project/kmeans/Kmeans.py")

# Read images
paths = crawl_dir(os.path.abspath("test/data"))
img = [read_img(path, start=start, args=args) for path in paths]
voxels = da.concatenate([x[1] for x in img]).reshape(-1)

centroids = [0.0, 125.8, 251.6, 377.4]  # Initial centroids
voxel_pair = None

bincount = da.bincount(voxels)
bincount = bincount[bincount != 0]
unique = da.unique(voxels)
unique, counts = dask.compute(unique, bincount)

for i in range(0, args.iterations):  # Disregard convergence.
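The body of the iteration loop is cut off at the end of this listing. Purely as an illustration of the technique the loop performs, and not the project's code, one step of one-dimensional k-means over the precomputed (unique value, count) pairs could look like the following; update_centroids is a hypothetical helper introduced here, and convergence is deliberately ignored to match the comment above:

import numpy as np


def update_centroids(unique, counts, centroids):
    # Hypothetical sketch of a single 1-D k-means step over (value, count) pairs:
    # assign every unique voxel value to its nearest centroid, then recompute
    # each centroid as the count-weighted mean of the values assigned to it.
    unique = np.asarray(unique, dtype=np.float64)
    counts = np.asarray(counts, dtype=np.float64)
    centroids = np.asarray(centroids, dtype=np.float64)
    nearest = np.argmin(np.abs(unique[:, None] - centroids[None, :]), axis=1)
    updated = []
    for k in range(len(centroids)):
        mask = nearest == k
        weight = counts[mask].sum()
        # Keep the old centroid if no values were assigned to it.
        updated.append(
            (unique[mask] * counts[mask]).sum() / weight if weight > 0 else centroids[k]
        )
    return updated

Under this sketch, each pass of the loop would simply reassign centroids to the value returned by the helper.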