def run_task_bundle(settings, layer, roi):
    cgraph = chunkedgraph.ChunkedGraph(
        table_id=settings['chunkedgraph']['table_id'],
        instance_id=settings['chunkedgraph']['instance_id'])

    meshing = settings['meshing']
    mip = meshing.get('mip', 2)
    max_err = meshing.get('max_simplification_error', 40)
    mesh_dir = meshing.get('mesh_dir', None)

    base_chunk_span = int(cgraph.fan_out) ** max(0, layer - 2)
    chunksize = np.array(cgraph.chunk_size, dtype=int) * base_chunk_span

    for x in range(roi[0].start, roi[0].stop, chunksize[0]):
        for y in range(roi[1].start, roi[1].stop, chunksize[1]):
            for z in range(roi[2].start, roi[2].stop, chunksize[2]):
                chunk_id = cgraph.get_chunk_id_from_coord(layer, x, y, z)
                try:
                    chunk_mesh_task(cgraph, chunk_id, cgraph._cv_path,
                                    cv_mesh_dir=mesh_dir, mip=mip,
                                    max_err=max_err)
                except EmptyVolumeException as e:
                    print("Warning: Empty segmentation encountered: %s" % e)
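# Worked example for the striding above (assumed values, not taken from a real
# dataset): with fan_out = 2 and chunk_size = [512, 512, 64], a layer-4 pass
# gives base_chunk_span = 2 ** (4 - 2) = 4 and hence a stride of
# [2048, 2048, 256] voxels, i.e. one chunk_mesh_task per layer-4 chunk.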
def _family_consistency_test_thread(args):
    """ Helper to test family consistency """
    table_id, coord, layer_id = args
    x, y, z = coord

    cg = chunkedgraph.ChunkedGraph(table_id)
    rows = cg.range_read_chunk(layer_id, x, y, z)

    failed_node_ids = []

    time_start = time.time()
    for i_k, k in enumerate(rows.keys()):
        if i_k % 100 == 1:
            dt = time.time() - time_start
            eta = dt / i_k * len(rows) - dt
            print("%d / %d - %.3fs -> %.3fs " % (i_k, len(rows), dt, eta),
                  end="\r")

        node_id = chunkedgraph.deserialize_uint64(k)
        parent_id = np.frombuffer(rows[k].cells["0"][b'parents'][0].value,
                                  dtype=np.uint64)

        if node_id not in cg.get_children(parent_id):
            failed_node_ids.append([node_id, parent_id])

    return failed_node_ids
def _count_and_download_nodes(args):
    serialized_cg_info, chunk_coords = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    n_nodes_per_l2_node = []
    n_l2_nodes_per_chunk = []
    n_l1_nodes_per_chunk = []
    # l1_nodes = []
    rep_l1_nodes = []
    for chunk_coord in chunk_coords:
        x, y, z = chunk_coord
        rr = cg.range_read_chunk(layer=2, x=x, y=y, z=z,
                                 columns=[column_keys.Hierarchy.Child])

        n_l2_nodes_per_chunk.append(len(rr))

        n_l1_nodes = 0
        for k in rr.keys():
            children = rr[k][column_keys.Hierarchy.Child][0].value
            rep_l1_nodes.append(children[np.random.randint(0, len(children))])
            # l1_nodes.extend(children)
            n_nodes_per_l2_node.append(len(children))
            n_l1_nodes += len(children)

        n_l1_nodes_per_chunk.append(n_l1_nodes)

    print(f"{len(chunk_coords)} took {time.time() - time_start}s")

    return (n_nodes_per_l2_node, n_l2_nodes_per_chunk, n_l1_nodes_per_chunk,
            rep_l1_nodes)
def initialize_chunkedgraph(
    meta: ChunkedGraphMeta, cg_mesh_dir="mesh_dir", n_bits_root_counter=8,
    size=None
):
    """ Initializes a chunkedgraph on BigTable """
    _check_table_existence(meta.bigtable_config, meta.graph_config)

    ws_cv = cloudvolume.CloudVolume(meta.data_source.watershed)
    if size is not None:
        size = np.array(size)
        for i in range(len(ws_cv.info["scales"])):
            original_size = ws_cv.info["scales"][i]["size"]
            size = np.min([size, original_size], axis=0)
            ws_cv.info["scales"][i]["size"] = [int(x) for x in size]
            size[:-1] //= 2

    dataset_info = ws_cv.info
    dataset_info["mesh"] = cg_mesh_dir
    dataset_info["data_dir"] = meta.data_source.watershed
    dataset_info["graph"] = {
        "chunk_size": [int(s) for s in meta.graph_config.chunk_size]
    }

    kwargs = {
        "instance_id": meta.bigtable_config.instance_id,
        "project_id": meta.bigtable_config.project_id,
        "table_id": meta.graph_config.graph_id,
        "chunk_size": meta.graph_config.chunk_size,
        "fan_out": np.uint64(meta.graph_config.fanout),
        "n_layers": np.uint64(meta.layer_count),
        "dataset_info": dataset_info,
        "use_skip_connections": meta.graph_config.use_skip_connections,
        "s_bits_atomic_layer": meta.graph_config.s_bits_atomic_layer,
        "n_bits_root_counter": n_bits_root_counter,
        "is_new": True,
    }
    return chunkedgraph.ChunkedGraph(**kwargs)
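# Note on the size clamp above: when `size` is given, each successive mip
# level halves only the in-plane extent (size[:-1] //= 2) while z is kept,
# which matches watershed volumes that are downsampled in x/y only.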
def children_test(table_id, layer, coord_list):
    cg = chunkedgraph.ChunkedGraph(table_id)

    for coords in coord_list:
        x, y, z = coords
        node_ids = cg.range_read_chunk(layer, x, y, z, row_keys=['children'])

        all_children = []
        children_chunks = []
        for node_id_b, data in node_ids.items():
            children = np.frombuffer(data.cells['0'][b'children'][0].value,
                                     dtype=np.uint64)
            for child in children:
                all_children.append(child)
                children_chunks.append(cg.get_chunk_id(child))

        u_children_chunks, c_children_chunks = np.unique(children_chunks,
                                                         return_counts=True)
        u_chunk_coords = [cg.get_chunk_coordinates(c)
                          for c in u_children_chunks]

        print("\n--- Layer %d ---- [%d, %d, %d] ---" % (layer, x, y, z))
        print("N(all children): %d" % len(all_children))
        print("N(unique children): %d" % len(np.unique(all_children)))
        print("N(unique children chunks): %d" % len(u_children_chunks))
        print("Unique children chunk coords", u_chunk_coords)
        print("N(ids per unique children chunk):", c_children_chunks)
def get_cg(table_id):
    if table_id not in cache:
        instance_id = current_app.config['CHUNKGRAPH_INSTANCE_ID']
        client = get_bigtable_client(current_app.config)

        # Create ChunkedGraph logging
        logger = logging.getLogger(f"{instance_id}/{table_id}")
        logger.setLevel(current_app.config['LOGGING_LEVEL'])

        # Prevent duplicate logs from Flask's parent logger
        logger.propagate = False

        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(current_app.config['LOGGING_LEVEL'])
        formatter = jsonformatter.JsonFormatter(
            fmt=current_app.config['LOGGING_FORMAT'],
            datefmt=current_app.config['LOGGING_DATEFORMAT'])
        formatter.converter = time.gmtime
        handler.setFormatter(formatter)

        logger.addHandler(handler)

        # Create ChunkedGraph
        cache[table_id] = chunkedgraph.ChunkedGraph(table_id=table_id,
                                                    instance_id=instance_id,
                                                    client=client,
                                                    logger=logger)

    current_app.table_id = table_id
    return cache[table_id]
def cg(self):
    if self._cg is None:
        self._cg = chunkedgraph.ChunkedGraph(
            table_id=self.table_id,
            instance_id=self.instance_id,
            project_id=self.project_id)

    return self._cg
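# Minimal sketch of the owning class this accessor presumably sits on (the
# class itself is not part of this section; all names below are hypothetical).
# The point is lazy construction: the ChunkedGraph client is only built on
# first access and cached afterwards.
#
# class CGWorker:
#     def __init__(self, table_id, instance_id, project_id):
#         self.table_id = table_id
#         self.instance_id = instance_id
#         self.project_id = project_id
#         self._cg = None
#
#     @property
#     def cg(self):
#         ...  # body as defined above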
def _write_flat_segmentation_thread(args):
    """ Helper of write_flat_segmentation """
    cg_info, start_block, end_block, from_url, to_url, mip = args

    assert 'segmentation' in to_url
    assert 'svenmd' in to_url

    from_cv = cloudvolume.CloudVolume(from_url, mip=mip)
    to_cv = cloudvolume.CloudVolume(to_url, mip=mip)

    cg = chunkedgraph.ChunkedGraph(table_id=cg_info["table_id"],
                                   instance_id=cg_info["instance_id"],
                                   project_id=cg_info["project_id"],
                                   credentials=cg_info["credentials"])

    for block_z in range(start_block[2], end_block[2]):
        z_start = block_z * cg.chunk_size[2]
        z_end = (block_z + 1) * cg.chunk_size[2]
        for block_y in range(start_block[1], end_block[1]):
            y_start = block_y * cg.chunk_size[1]
            y_end = (block_y + 1) * cg.chunk_size[1]
            for block_x in range(start_block[0], end_block[0]):
                x_start = block_x * cg.chunk_size[0]
                x_end = (block_x + 1) * cg.chunk_size[0]

                block = from_cv[x_start:x_end, y_start:y_end, z_start:z_end]
                _, remapped_block = get_sv_to_root_id_mapping_chunk(
                    cg, [x_start, y_start, z_start], block)

                to_cv[x_start:x_end, y_start:y_end, z_start:z_end] = \
                    remapped_block
def initialize_chunkedgraph(cg_table_id, ws_cv_path, chunk_size, cg_mesh_dir,
                            fan_out=2, instance_id=None, project_id=None):
    """ Initializes a chunkedgraph on BigTable

    :param cg_table_id: str
        name of chunkedgraph
    :param ws_cv_path: str
        path to watershed segmentation on Google Cloud
    :param chunk_size: np.ndarray
        array of three ints
    :param cg_mesh_dir: str
        mesh folder name
    :param fan_out: int
        fan out of chunked graph (2 == Octree)
    :param instance_id: str
        Google instance id
    :param project_id: str
        Google project id
    :return: ChunkedGraph
    """
    ws_cv = cloudvolume.CloudVolume(ws_cv_path)

    bbox = np.array(ws_cv.bounds.to_list()).reshape(2, 3)

    # assert np.all(bbox[0] == 0)
    # assert np.all((bbox[1] % chunk_size) == 0)

    n_chunks = ((bbox[1] - bbox[0]) / chunk_size).astype(int)
    n_layers = int(
        np.ceil(chunkedgraph_utils.log_n(np.max(n_chunks), fan_out))) + 2

    dataset_info = ws_cv.info
    dataset_info["mesh"] = cg_mesh_dir
    dataset_info["data_dir"] = ws_cv_path
    dataset_info["graph"] = {"chunk_size": [int(s) for s in chunk_size]}

    kwargs = {
        "table_id": cg_table_id,
        "chunk_size": chunk_size,
        "fan_out": np.uint64(fan_out),
        "n_layers": np.uint64(n_layers),
        "dataset_info": dataset_info,
        "is_new": True
    }

    if instance_id is not None:
        kwargs["instance_id"] = instance_id

    if project_id is not None:
        kwargs["project_id"] = project_id

    return chunkedgraph.ChunkedGraph(**kwargs)
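# Worked example of the layer computation above (assumed numbers, not a real
# dataset): a watershed volume spanning [131072, 131072, 8192] voxels with
# chunk_size = [512, 512, 64] covers [256, 256, 128] chunks; with fan_out = 2
# this yields n_layers = ceil(log2(256)) + 2 = 10.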
def _remeshing(serialized_cg_info, lvl2_nodes):
    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    # TODO: stop_layer and mip should be configurable by dataset
    meshgen.remeshing(cg, lvl2_nodes, stop_layer=4, mesh_path=None,
                      mip=1, max_err=320)

    return Response(status=200)
def get_merge_candidates(table_id, save_dir=f"{HOME}/benchmarks/",
                         n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_candidates, multi_args,
                                       n_threads=n_threads, verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_candidates, multi_args,
                                          n_threads=n_threads)

    merge_edges = []
    merge_edge_weights = []
    for result in results:
        merge_edges.extend(result[0])
        merge_edge_weights.extend(result[1])

    save_folder = f"{save_dir}/{table_id}/"
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    with h5py.File(f"{save_folder}/merge_edge_stats.h5", "w") as f:
        f.create_dataset("merge_edges", data=merge_edges, compression="gzip")
        f.create_dataset("merge_edge_weights", data=merge_edge_weights,
                         compression="gzip")
def _mesh_lvl2_nodes(serialized_cg_info, lvl2_nodes):
    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    for lvl2_node in lvl2_nodes:
        print(lvl2_node)
        meshgen.mesh_lvl2_preview(cg, lvl2_node, supervoxel_ids=None,
                                  cv_path=None, cv_mesh_dir=None, mip=2,
                                  simplification_factor=999999,
                                  max_err=40, parallel_download=1,
                                  verbose=True, cache_control='no-cache')

    return Response(status=200)
def _get_root_timings(args):
    serialized_cg_info, l1_ids = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    timings = []
    for l1_id in l1_ids:
        time_start = time.time()
        root = cg.get_root(l1_id)
        dt = time.time() - time_start
        timings.append(dt)

    return timings
def cg(self):
    if self._cg is None:
        kwargs = {}

        if self._instance_id is not None:
            kwargs["instance_id"] = self._instance_id

        if self._project_id is not None:
            kwargs["project_id"] = self._project_id

        self._cg = chunkedgraph.ChunkedGraph(table_id=self._cg_table_id,
                                             **kwargs)

    return self._cg
def _get_subgraph_timings(args):
    serialized_cg_info, root_ids, rep_l1_chunk_ids = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    timings = []
    for root_id, rep_l1_chunk_id in zip(root_ids, rep_l1_chunk_ids):
        bb = np.array([rep_l1_chunk_id, rep_l1_chunk_id + 1], dtype=int)

        time_start = time.time()
        sv_ids = cg.get_subgraph_nodes(root_id, bb, bb_is_coordinate=False)
        dt = time.time() - time_start
        timings.append(dt)

    return timings
def _get_merge_candidates(args):
    serialized_cg_info, chunk_coords = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    merge_edges = []
    merge_edge_weights = []
    for chunk_coord in chunk_coords:
        chunk_id = cg.get_chunk_id(layer=1, x=chunk_coord[0],
                                   y=chunk_coord[1], z=chunk_coord[2])
        rr = cg.range_read_chunk(chunk_id=chunk_id,
                                 columns=[column_keys.Connectivity.Partner,
                                          column_keys.Connectivity.Connected,
                                          column_keys.Hierarchy.Parent])

        ps = []
        edges = []
        for it in rr.items():
            e, _, _ = cg._retrieve_connectivity(it, connected_edges=False)
            edges.extend(e)
            ps.extend([it[1][column_keys.Hierarchy.Parent][0].value] * len(e))

        if len(edges) == 0:
            continue

        edges = np.sort(np.array(edges), axis=1)

        cols = {"sv1": edges[:, 0], "sv2": edges[:, 1], "parent": ps}
        df = pd.DataFrame(data=cols)
        dfg = df.groupby(["sv1", "sv2"]).aggregate(np.sum).reset_index()

        _, i, c = np.unique(dfg[["parent"]], return_index=True,
                            return_counts=True)

        merge_edges.extend(
            np.array(dfg.loc[i][["sv1", "sv2"]], dtype=np.uint64))
        merge_edge_weights.extend(c)

    print(f"{len(chunk_coords)} took {time.time() - time_start}s")

    return merge_edges, merge_edge_weights
def _mesh_lvl2_previews_threads(args):
    (serialized_cg_info, lvl2_node_id, supervoxel_ids,
     cv_path, cv_mesh_dir, mip, simplification_factor,
     max_err, parallel_download, verbose, cache_control) = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)
    mesh_lvl2_preview(cg, lvl2_node_id, supervoxel_ids=supervoxel_ids,
                      cv_path=cv_path, cv_mesh_dir=cv_mesh_dir, mip=mip,
                      simplification_factor=simplification_factor,
                      max_err=max_err, parallel_download=parallel_download,
                      verbose=verbose, cache_control=cache_control)
def count_nodes_and_edges(table_id, n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_count_nodes_and_edges, multi_args,
                                       n_threads=n_threads, verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_count_nodes_and_edges, multi_args,
                                          n_threads=n_threads)

    n_edges_per_chunk = []
    n_nodes_per_chunk = []
    for result in results:
        n_nodes_per_chunk.extend(result[0])
        n_edges_per_chunk.extend(result[1])

    return n_nodes_per_chunk, n_edges_per_chunk
def _get_root_ids_and_sv_chunks(args):
    serialized_cg_info, root_ids = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    n_l1_nodes_per_root = []
    rep_l1_nodes = []
    rep_l1_chunk_ids = []
    for root_id in root_ids:
        l1_ids = cg.get_subgraph_nodes(root_id)
        n_l1_nodes_per_root.append(len(l1_ids))
        rep_l1_node = l1_ids[np.random.randint(0, len(l1_ids))]
        rep_l1_nodes.append(rep_l1_node)
        rep_l1_chunk_ids.append(cg.get_chunk_coordinates(rep_l1_node))

    print(f"{len(root_ids)} took {time.time() - time_start}s")

    return root_ids, n_l1_nodes_per_root, rep_l1_nodes, rep_l1_chunk_ids
def _read_root_rows_thread(args) -> list:
    start_seg_id, end_seg_id, serialized_cg_info, time_stamp = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    start_id = cg.get_node_id(segment_id=start_seg_id,
                              chunk_id=cg.root_chunk_id)
    end_id = cg.get_node_id(segment_id=end_seg_id,
                            chunk_id=cg.root_chunk_id)

    rows = cg.read_node_id_rows(start_id=start_id,
                                end_id=end_id,
                                end_id_inclusive=False,
                                end_time=time_stamp,
                                end_time_inclusive=True)

    root_ids = [k for (k, v) in rows.items()
                if column_keys.Hierarchy.NewParent not in v]

    return root_ids
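# Note on the filter above: a row in the root chunk counts as a current root
# iff it carries no NewParent column at or before `time_stamp`; rows with a
# NewParent entry were superseded by a later merge or split.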
def family_consistency_test(table_id, n_threads=64):
    """ Runs a simple test on the WHOLE graph

    tests: id in children(parent(id))

    :param table_id: str
    :param n_threads: int
    :return: dict
        n x 2 per layer
        each failed pair: (node_id, parent_id)
    """
    cg = chunkedgraph.ChunkedGraph(table_id)

    failed_node_id_dict = {}
    for layer_id in range(1, cg.n_layers):
        print("\n\n Layer %d \n\n" % layer_id)

        step = int(cg.fan_out ** np.max([0, layer_id - 2]))
        coords = list(itertools.product(range(0, 8, step),
                                        range(0, 8, step),
                                        range(0, 4, step)))

        multi_args = []
        for coord in coords:
            multi_args.append([table_id, coord, layer_id])

        collected_failed_node_ids = mu.multisubprocess_func(
            _family_consistency_test_thread, multi_args, n_threads=n_threads)

        failed_node_ids = []
        for _failed_node_ids in collected_failed_node_ids:
            failed_node_ids.extend(_failed_node_ids)

        failed_node_id_dict[layer_id] = np.array(failed_node_ids)

        print("\n%d node rows failed\n" % len(failed_node_ids))

    return failed_node_id_dict
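def _example_family_consistency_run():
    # Hedged usage sketch; "example_table" is a placeholder table id, not a
    # real deployment. Reports, per layer, how many (node_id, parent_id)
    # pairs violate the invariant id in children(parent(id)).
    failed = family_consistency_test("example_table", n_threads=16)
    for layer_id, pairs in failed.items():
        print(f"layer {layer_id}: {len(pairs)} failed pairs")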
def _count_nodes_and_edges(args):
    serialized_cg_info, chunk_coords = args

    time_start = time.time()

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    n_edges_per_chunk = []
    n_nodes_per_chunk = []
    for chunk_coord in chunk_coords:
        x, y, z = chunk_coord
        rr = cg.range_read_chunk(layer=1, x=x, y=y, z=z)
        n_nodes_per_chunk.append(len(rr))

        n_edges = 0
        for k in rr.keys():
            n_edges += len(rr[k][column_keys.Connectivity.Partner][0].value)

        n_edges_per_chunk.append(n_edges)

    print(f"{len(chunk_coords)} took {time.time() - time_start}s")

    return n_nodes_per_chunk, n_edges_per_chunk
def _get_merge_timings(args):
    serialized_cg_info, merge_edges = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    merge_timings = []
    for merge_edge in merge_edges:
        time_start = time.time()
        root_ids = cg.add_edges(user_id="ChuckNorris",
                                atomic_edges=[merge_edge]).new_root_ids
        dt = time.time() - time_start
        merge_timings.append(dt)

    split_timings = []
    for merge_edge in merge_edges:
        time_start = time.time()
        root_ids = cg.remove_edges(user_id="ChuckNorris",
                                   atomic_edges=[merge_edge],
                                   mincut=False).new_root_ids
        dt = time.time() - time_start
        split_timings.append(dt)

    return merge_timings, split_timings
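# Note: every sampled edge is first added and then removed again (without a
# mincut), so a completed benchmark pass restores the original connectivity,
# although each edit mints new parent and root IDs along the way.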
def get_cg(table_id):
    assert (
        table_id.startswith("fly")
        or table_id.startswith("golden")
        or table_id.startswith("pinky100_rv")
        or table_id.startswith("pinky100_arv")
    )

    if table_id not in CACHE:
        instance_id = current_app.config["CHUNKGRAPH_INSTANCE_ID"]
        client = get_bigtable_client(current_app.config)

        # Create ChunkedGraph logging
        logger = logging.getLogger(f"{instance_id}/{table_id}")
        logger.setLevel(current_app.config["LOGGING_LEVEL"])

        # Prevent duplicate logs from Flask's parent logger
        logger.propagate = False

        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(current_app.config["LOGGING_LEVEL"])
        formatter = jsonformatter.JsonFormatter(
            fmt=current_app.config["LOGGING_FORMAT"],
            datefmt=current_app.config["LOGGING_DATEFORMAT"],
        )
        formatter.converter = time.gmtime
        handler.setFormatter(formatter)

        logger.addHandler(handler)

        # Create ChunkedGraph
        CACHE[table_id] = chunkedgraph.ChunkedGraph(
            table_id=table_id, instance_id=instance_id,
            client=client, logger=logger
        )

    current_app.table_id = table_id
    return CACHE[table_id]
def _read_delta_root_rows_thread(args) -> Sequence[list]:
    (start_seg_id, end_seg_id,
     serialized_cg_info, time_stamp_start, time_stamp_end) = args

    cg = chunkedgraph.ChunkedGraph(**serialized_cg_info)

    start_id = cg.get_node_id(segment_id=start_seg_id,
                              chunk_id=cg.root_chunk_id)
    end_id = cg.get_node_id(segment_id=end_seg_id,
                            chunk_id=cg.root_chunk_id)

    # Apply column filters to avoid Lock columns
    rows = cg.read_node_id_rows(start_id=start_id,
                                start_time=time_stamp_start,
                                end_id=end_id,
                                end_id_inclusive=False,
                                columns=[column_keys.Hierarchy.FormerParent,
                                         column_keys.Hierarchy.NewParent],
                                end_time=time_stamp_end,
                                end_time_inclusive=True)

    # New roots are those that have no NewParent in this time window
    new_root_ids = [k for (k, v) in rows.items()
                    if column_keys.Hierarchy.NewParent not in v]

    # Expired roots are the IDs of FormerParent's
    # whose timestamp is before the start_time
    expired_root_ids = []
    for k, v in rows.items():
        if column_keys.Hierarchy.FormerParent in v:
            fp = v[column_keys.Hierarchy.FormerParent]
            for cell_entry in fp:
                expired_root_ids.extend(cell_entry.value)

    return new_root_ids, expired_root_ids
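# Semantics of the two lists above: within (time_stamp_start, time_stamp_end],
# new_root_ids are root rows that were written but never superseded (no
# NewParent), while expired_root_ids collects the FormerParent values on rows
# in the window, i.e. roots that were current before it and replaced during it.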
def _cgraph(request, fan_out=2, n_layers=10):
    # Setup Chunked Graph
    dataset_info = {"data_dir": ""}

    graph = chunkedgraph.ChunkedGraph(
        request.function.__name__,
        project_id="IGNORE_ENVIRONMENT_PROJECT",
        credentials=credentials.AnonymousCredentials(),
        instance_id="emulated_instance",
        dataset_info=dataset_info,
        chunk_size=np.array([512, 512, 64], dtype=np.uint64),
        is_new=True,
        fan_out=np.uint64(fan_out),
        n_layers=np.uint64(n_layers),
    )
    graph._cv = CloudVolumeMock()

    # Setup Chunked Graph - finalizer deletes the emulated table
    def fin():
        graph.table.delete()

    request.addfinalizer(fin)
    return graph
def get_root_ids_and_sv_chunks(table_id, save_dir=f"{HOME}/benchmarks/",
                               n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    save_folder = f"{save_dir}/{table_id}/"
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    if not os.path.exists(f"{save_folder}/root_ids.h5"):
        root_ids = chunkedgraph_comp.get_latest_roots(cg,
                                                      n_threads=n_threads)

        with h5py.File(f"{save_folder}/root_ids.h5", "w") as f:
            f.create_dataset("root_ids", data=root_ids)
    else:
        with h5py.File(f"{save_folder}/root_ids.h5", "r") as f:
            root_ids = f["root_ids"][()]

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    order = np.arange(len(root_ids))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, root_ids[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_root_ids_and_sv_chunks,
                                       multi_args, n_threads=n_threads,
                                       verbose=False, debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_root_ids_and_sv_chunks,
                                          multi_args, n_threads=n_threads)

    root_ids = []
    n_l1_nodes_per_root = []
    rep_l1_nodes = []
    rep_l1_chunk_ids = []
    for result in results:
        root_ids.extend(result[0])
        n_l1_nodes_per_root.extend(result[1])
        rep_l1_nodes.extend(result[2])
        rep_l1_chunk_ids.extend(result[3])

    with h5py.File(f"{save_folder}/root_stats.h5", "w") as f:
        f.create_dataset("root_ids", data=root_ids, compression="gzip")
        f.create_dataset("n_l1_nodes_per_root", data=n_l1_nodes_per_root,
                         compression="gzip")
        f.create_dataset("rep_l1_nodes", data=rep_l1_nodes,
                         compression="gzip")
        f.create_dataset("rep_l1_chunk_ids", data=rep_l1_chunk_ids,
                         compression="gzip")
def get_cg():
    if 'cg' not in g:
        table_id = current_app.config['CHUNKGRAPH_TABLE_ID']
        client = get_client(current_app.config)
        g.cg = chunkedgraph.ChunkedGraph(table_id=table_id, client=client)

    return g.cg
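# Hedged usage sketch: how this per-request accessor would typically be used
# in a Flask view. The blueprint, route, and response shape below are
# placeholders, not part of this codebase.
#
# @bp.route("/root/<node_id>")
# def handle_root(node_id):
#     cg = get_cg()
#     root_id = cg.get_root(np.uint64(node_id))
#     return jsonify({"root_id": str(root_id)})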
def _add_layer_thread(args):
    """ Creates abstraction layer """
    table_id, layer_id, chunk_coords, n_threads_per_process = args

    cg = chunkedgraph.ChunkedGraph(table_id=table_id)
    cg.add_layer(layer_id, chunk_coords, n_threads=n_threads_per_process)
def get_merge_split_timings(table_id, save_dir=f"{HOME}/benchmarks/",
                            job_size=500, n_threads=1):
    save_folder = f"{save_dir}/{table_id}/"

    merge_edges, merge_edge_weights = load_merge_stats(save_folder)
    probs = merge_edge_weights / np.sum(merge_edge_weights)

    n_jobs = n_threads * 3

    cg = chunkedgraph.ChunkedGraph(table_id)
    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 0:
        del cg_serialized_info["credentials"]

    time_start = time.time()

    order = np.arange(len(merge_edges))
    np.random.seed(int(time.time()))

    replace = False
    blocks = np.random.choice(order, job_size * n_jobs, p=probs,
                              replace=replace).reshape(n_jobs, job_size)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, merge_edges[block]])

    print(f"Building jobs took {time.time() - time_start}s")

    time_start = time.time()
    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_timings, multi_args,
                                       n_threads=n_threads, verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_timings, multi_args,
                                          n_threads=n_threads)
    dt = time.time() - time_start

    def summarize(timings):
        """ Percentile and summary statistics for one set of timings """
        percentiles = [np.percentile(timings, k) for k in range(1, 100, 1)]
        return {"percentiles": percentiles,
                "p01": percentiles[0],
                "p05": percentiles[4],
                "p95": percentiles[94],
                "p99": percentiles[98],
                "mean": np.mean(timings),
                "std": np.std(timings),
                "median": np.median(timings),
                "total_time_s": dt,
                "job_size": job_size,
                "n_jobs": n_jobs,
                "n_threads": n_threads,
                "replace": replace,
                "requests_per_s": job_size * n_jobs / dt}

    merge_timings = []
    split_timings = []
    for result in results:
        merge_timings.extend(result[0])
        split_timings.extend(result[1])

    merge_results = summarize(merge_timings)
    split_results = summarize(split_timings)

    return merge_results, split_results
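def _example_timing_run():
    # Hedged usage sketch; "example_table" is a placeholder table id.
    # Assumes get_merge_candidates() ran first so merge_edge_stats.h5
    # exists in the save folder.
    merge_results, split_results = get_merge_split_timings(
        "example_table", job_size=500, n_threads=8)
    print("merge median [s]:", merge_results["median"])
    print("split median [s]:", split_results["median"])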