Example #1
def run_meshlab_script_on_dir(script_name,
                              in_dir,
                              out_dir,
                              suffix,
                              arg_dict={},
                              n_threads=1):
    paths = glob.glob(in_dir + "/*.obj")

    print(len(paths))

    if len(suffix) > 0:
        suffix = "_{}".format(suffix)

    n_jobs = n_threads * 3
    if len(paths) < n_jobs:
        n_jobs = len(paths)

    path_blocks = np.array_split(paths, n_jobs)

    multi_args = []
    for path_block in path_blocks:
        multi_args.append([script_name, path_block, out_dir, suffix, arg_dict])

    if n_threads == 1:
        mu.multiprocess_func(_run_meshlab_script_on_dir_thread,
                             multi_args,
                             debug=True,
                             verbose=True,
                             n_threads=1)
    else:
        mu.multisubprocess_func(_run_meshlab_script_on_dir_thread,
                                multi_args,
                                n_threads=n_threads)
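
All of the functions in this section share the same dispatch pattern: the work is split into roughly n_threads * 3 blocks, a single-threaded run goes through mu.multiprocess_func with debugging enabled, and a multi-threaded run goes through mu.multisubprocess_func. The following stand-alone sketch mimics the splitting-and-dispatch part of that pattern with a plain multiprocessing.Pool; the _scale_block worker and run_on_all driver are illustrative stand-ins and not part of the library code above.

import numpy as np
from multiprocessing import Pool


def _scale_block(args):
    # one argument list per job, mirroring the multi_args convention above
    block, factor = args
    return [x * factor for x in block]


def run_on_all(values, factor=2, n_threads=1):
    # oversubscribe with ~3 jobs per thread, but never more jobs than items
    n_jobs = min(len(values), max(1, n_threads * 3))
    blocks = np.array_split(values, n_jobs)
    multi_args = [[block, factor] for block in blocks]

    if n_threads == 1:
        # serial, in-process execution -- the easy-to-debug path
        results = [_scale_block(args) for args in multi_args]
    else:
        # parallel execution in worker processes
        with Pool(n_threads) as pool:
            results = pool.map(_scale_block, multi_args)

    return [x for block in results for x in block]


if __name__ == "__main__":
    print(run_on_all(list(range(10)), factor=3, n_threads=2))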
Example #2
    def create_manifests_for_higher_layers(self, n_threads=1):
        root_id_max = self.cg.get_max_node_id(
            self.cg.get_chunk_id(layer=np.int(self.cg.n_layers),
                                 x=np.int(0), y=np.int(0),
                                 z=np.int(0)))

        root_id_blocks = np.linspace(1, root_id_max, n_threads*3).astype(np.int)
        cg_info = self.cg.get_serialized_info()
        del (cg_info['credentials'])

        multi_args = []
        for i_block in range(len(root_id_blocks) - 1):
            multi_args.append([cg_info, self.cv_path, self.cv_mesh_dir,
                               root_id_blocks[i_block],
                               root_id_blocks[i_block + 1],
                               self.highest_mesh_layer])

        # Run parallelizing
        if n_threads == 1:
            mu.multiprocess_func(meshgen._create_manifest_files_thread,
                                 multi_args, n_threads=n_threads, verbose=True,
                                 debug=n_threads == 1)
        else:
            mu.multisubprocess_func(meshgen._create_manifest_files_thread,
                                    multi_args, n_threads=n_threads)
Example #3
def create_atomic_chunks(im, aff_dtype=np.float32, n_threads=1):
    """ Creates all atomic chunks

    :param im: IngestionManager
    :param aff_dtype: np.dtype
        affinity datatype (np.float32 or np.float64)
    :param n_threads: int
        number of threads to use
    :return:
    """

    im_info = im.get_serialized_info()

    multi_args = []

    # Chunk order (shuffling is currently disabled)
    chunk_coords = list(im.chunk_coord_gen)
    # np.random.shuffle(chunk_coords)

    for i_chunk_coord, chunk_coord in enumerate(chunk_coords):
        multi_args.append([
            im_info, chunk_coord, aff_dtype, i_chunk_coord,
            len(chunk_coords)
        ])

    if n_threads == 1:
        mu.multiprocess_func(_create_atomic_chunk,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_atomic_chunk,
                                multi_args,
                                n_threads=n_threads)
Example #4
def rewrite_segmentation(dataset_name, n_threads=64, n_units_per_thread=None):
    if dataset_name == "pinky":
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
        from_url = "gs://neuroglancer/pinky40_v11/watershed/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
    elif dataset_name == "basil":
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
        from_url = "gs://neuroglancer/ranl/basil_4k_oldnet/ws/"
        to_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
    else:
        raise Exception("Dataset unknown")

    file_paths = np.sort(glob.glob(creator_utils.dir_from_layer_name(
        creator_utils.layer_name_from_cv_url(cv_url)) + "/*rg2cg*"))

    if n_units_per_thread is None:
        file_path_blocks = np.array_split(file_paths, n_threads*3)
    else:
        n_blocks = int(np.ceil(len(file_paths) / n_units_per_thread))
        file_path_blocks = np.array_split(file_paths, n_blocks)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, from_url, to_url])

    # Run parallelizing
    if n_threads == 1:
        mu.multiprocess_func(_rewrite_segmentation_thread, multi_args,
                             n_threads=n_threads, verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_rewrite_segmentation_thread, multi_args,
                                n_threads=n_threads)
Example #5
def download_meshes(seg_ids, target_dir, cv_path, n_threads=1):
    """ Downloads meshes in target directory (parallel)

    :param seg_ids: list of ints
    :param target_dir: str
    :param cv_path: str
    :param n_threads: int
    """

    n_jobs = n_threads * 3
    if len(seg_ids) < n_jobs:
        n_jobs = len(seg_ids)

    seg_id_blocks = np.array_split(seg_ids, n_jobs)

    multi_args = []
    for seg_id_block in seg_id_blocks:
        multi_args.append([seg_id_block, cv_path, target_dir])

    if n_jobs == 1:
        mu.multiprocess_func(_download_meshes_thread,
                             multi_args,
                             debug=True,
                             verbose=True,
                             n_threads=1)
    else:
        mu.multisubprocess_func(_download_meshes_thread,
                                multi_args,
                                n_threads=n_threads)
Example #6
def download_meshes(seg_ids,
                    target_dir,
                    cv_path,
                    overwrite=True,
                    n_threads=1,
                    verbose=False,
                    merge_large_components=True,
                    remove_duplicate_vertices=True,
                    map_gs_to_https=True,
                    fmt="hdf5"):
    """ Downloads meshes in target directory (in parallel)

    :param seg_ids: list of uint64s
    :param target_dir: str
    :param cv_path: str
    :param overwrite: bool
    :param n_threads: int
    :param verbose: bool
    :param merge_large_components: bool
    :param remove_duplicate_vertices: bool
    :param map_gs_to_https: bool
    :param fmt: str
        "h5" is highly recommended
    """

    if n_threads > 1:
        n_jobs = n_threads * 3
    else:
        n_jobs = 1

    if len(seg_ids) < n_jobs:
        n_jobs = len(seg_ids)

    seg_id_blocks = np.array_split(seg_ids, n_jobs)

    multi_args = []
    for seg_id_block in seg_id_blocks:
        multi_args.append([
            seg_id_block, cv_path, target_dir, fmt, overwrite,
            merge_large_components, remove_duplicate_vertices, map_gs_to_https
        ])

    if n_jobs == 1:
        mu.multiprocess_func(_download_meshes_thread,
                             multi_args,
                             debug=True,
                             verbose=verbose,
                             n_threads=n_threads)
    else:
        mu.multisubprocess_func(_download_meshes_thread,
                                multi_args,
                                n_threads=n_threads,
                                package_name="meshparty",
                                n_retries=40)
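
A hypothetical invocation of the function above; the segment IDs, CloudVolume path and output directory are placeholders:

download_meshes(seg_ids=[648518346349538235, 648518346349537462],
                target_dir="./meshes",
                cv_path="gs://example-bucket/segmentation",
                overwrite=False,
                n_threads=4,
                fmt="h5")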
Example #7
def rechunk_dataset(dataset_name,
                    block_size=(1024, 1024, 64),
                    n_threads=64,
                    mip=0):
    if dataset_name == "pinky40em":
        from_url = "gs://neuroglancer/pinky40_v11/image_rechunked/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/image_512_512_32/"
    elif dataset_name == "pinky100seg":
        from_url = "gs://neuroglancer/nkem/pinky100_v0/ws/lost_no-random/bbox1_0/"
        to_url = "gs://neuroglancer/svenmd/pinky100_v0/ws/lost_no-random/bbox1_0_64_64_16/"
    elif dataset_name == "basil":
        raise NotImplementedError("basil is not supported yet")
    else:
        raise Exception("Dataset unknown")

    from_cv = cloudvolume.CloudVolume(from_url, mip=mip)

    dataset_bounds = np.array(from_cv.bounds.to_list())
    block_size = np.array(list(block_size))

    super_block_size = block_size * 2

    coordinate_iter = itertools.product(
        np.arange(dataset_bounds[0], dataset_bounds[3], super_block_size[0]),
        np.arange(dataset_bounds[1], dataset_bounds[4], super_block_size[1]),
        np.arange(dataset_bounds[2], dataset_bounds[5], super_block_size[2]))
    coordinates = np.array(list(coordinate_iter))

    multi_args = []
    for coordinate in coordinates:
        end_coordinate = coordinate + super_block_size
        m = end_coordinate > dataset_bounds[3:]
        end_coordinate[m] = dataset_bounds[3:][m]

        multi_args.append(
            [coordinate, end_coordinate, block_size, from_url, to_url, mip])

    # Run parallelizing
    if n_threads == 1:
        mu.multiprocess_func(_rewrite_image_thread,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_rewrite_image_thread,
                                multi_args,
                                n_threads=n_threads)
Example #8
def create_layer(im, layer_id, n_threads=1):
    """ Creates abstract layer of chunkedgraph

    Abstract layers have to be build in sequence. Abstract layers are all layers
    above the first layer (1). `create_atomic_chunks` creates layer 2 as well.
    Hence, this function is responsible for every creating layers > 2.

    :param im: IngestionManager
    :param layer_id: int
        > 2
    :param n_threads: int
        number of threads to use
    :return:
    """
    assert layer_id > 2

    child_chunk_coords = im.chunk_coords // im.cg.fan_out**(layer_id - 3)
    child_chunk_coords = child_chunk_coords.astype(np.int)
    child_chunk_coords = np.unique(child_chunk_coords, axis=0)

    parent_chunk_coords = child_chunk_coords // im.cg.fan_out
    parent_chunk_coords = parent_chunk_coords.astype(np.int)
    parent_chunk_coords, inds = np.unique(parent_chunk_coords,
                                          axis=0,
                                          return_inverse=True)

    im_info = im.get_serialized_info()
    multi_args = []

    # Randomize chunks
    order = np.arange(len(parent_chunk_coords), dtype=np.int)
    np.random.shuffle(order)

    for i_chunk, idx in enumerate(order):
        multi_args.append([
            im_info, layer_id, child_chunk_coords[inds == idx], i_chunk,
            len(order)
        ])

    if n_threads == 1:
        mu.multiprocess_func(_create_layer,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_layer, multi_args, n_threads=n_threads)
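
Because abstract layers have to be built bottom-up, a typical ingestion driver calls `create_atomic_chunks` first and then builds every remaining layer in order. A hedged sketch, assuming `im` is an already configured IngestionManager whose chunkedgraph exposes `n_layers`:

create_atomic_chunks(im, aff_dtype=np.float32, n_threads=16)   # layers 1 + 2
for layer_id in range(3, im.cg.n_layers + 1):                  # layers > 2, bottom-up
    create_layer(im, layer_id, n_threads=16)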
Example #9
def get_delta_roots(cg,
                    time_stamp_start: datetime.datetime,
                    time_stamp_end: Optional[datetime.datetime] = None,
                    min_seg_id: int = 1,
                    n_threads: int = 1) -> Sequence[np.uint64]:

    # Create filters: time and id range
    max_seg_id = cg.get_max_seg_id(cg.root_chunk_id) + 1

    n_blocks = int(np.min([n_threads + 1, max_seg_id - min_seg_id + 1]))
    seg_id_blocks = np.linspace(min_seg_id,
                                max_seg_id,
                                n_blocks,
                                dtype=np.uint64)

    cg_serialized_info = cg.get_serialized_info()

    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for i_id_block in range(0, len(seg_id_blocks) - 1):
        multi_args.append([
            seg_id_blocks[i_id_block], seg_id_blocks[i_id_block + 1],
            cg_serialized_info, time_stamp_start, time_stamp_end
        ])

    # Run parallelizing
    if n_threads == 1:
        results = mu.multiprocess_func(_read_delta_root_rows_thread,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_read_delta_root_rows_thread,
                                          multi_args,
                                          n_threads=n_threads)

    # aggregate all the results together
    new_root_ids = []
    expired_root_id_candidates = []
    for r1, r2 in results:
        new_root_ids.extend(r1)
        expired_root_id_candidates.extend(r2)
    expired_root_id_candidates = np.array(expired_root_id_candidates,
                                          dtype=np.uint64)
    # filter for uniqueness
    expired_root_id_candidates = np.unique(expired_root_id_candidates)

    # filter out the expired root ids whose creation (measured by the timestamp
    # of their Child links) is after the time_stamp_start
    rows = cg.read_node_id_rows(node_ids=expired_root_id_candidates,
                                columns=[column_keys.Hierarchy.Child],
                                end_time=time_stamp_start)
    expired_root_ids = np.array([k for (k, v) in rows.items()],
                                dtype=np.uint64)

    return np.array(new_root_ids, dtype=np.uint64), expired_root_ids
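
A hedged usage sketch for `get_delta_roots`, assuming `cg` is an existing ChunkedGraph instance and that UTC timestamps are appropriate; it returns the roots created and the roots expired within the given time window:

import datetime

now = datetime.datetime.now(datetime.timezone.utc)
new_roots, expired_roots = get_delta_roots(cg,
                                           time_stamp_start=now - datetime.timedelta(days=1),
                                           time_stamp_end=now,
                                           n_threads=4)
print(len(new_roots), "new roots,", len(expired_roots), "expired roots")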
Example #10
def download_and_store_cv_files(dataset_name="basil",
                                n_threads=10,
                                olduint32=False):
    """ Downloads files from google cloud using cloud-volume

    :param dataset_name: str
    :param n_threads: int
    :param olduint32: bool
    """
    if "basil" == dataset_name:
        cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
    elif "pinky40" == dataset_name:
        cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
    elif "pinky100" == dataset_name:
        cv_url = "gs://nkem/pinky100_v0/region_graph/"
    else:
        raise Exception("Could not identify region graph ressource")

    with storage.SimpleStorage(cv_url) as cv_st:
        dir_path = creator_utils.dir_from_layer_name(
            creator_utils.layer_name_from_cv_url(cv_st.layer_path))

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        file_paths = list(cv_st.list_files())

    file_chunks = np.array_split(file_paths, n_threads * 3)
    multi_args = []
    for i_file_chunk, file_chunk in enumerate(file_chunks):
        multi_args.append([i_file_chunk, cv_url, file_chunk, olduint32])

    # Run parallelizing
    if n_threads == 1:
        mu.multiprocess_func(_download_and_store_cv_files_thread,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_download_and_store_cv_files_thread,
                                multi_args,
                                n_threads=n_threads)
Example #11
def get_merge_candidates(table_id,
                         save_dir=f"{HOME}/benchmarks/",
                         n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(np.int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=np.int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_candidates,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_candidates,
                                          multi_args,
                                          n_threads=n_threads)
    merge_edges = []
    merge_edge_weights = []
    for result in results:
        merge_edges.extend(result[0])
        merge_edge_weights.extend(result[1])

    save_folder = f"{save_dir}/{table_id}/"

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    with h5py.File(f"{save_folder}/merge_edge_stats.h5", "w") as f:
        f.create_dataset("merge_edges", data=merge_edges, compression="gzip")
        f.create_dataset("merge_edge_weights",
                         data=merge_edge_weights,
                         compression="gzip")
Example #12
def mesh_lvl2_previews(cg,
                       lvl2_node_ids,
                       cv_path=None,
                       cv_mesh_dir=None,
                       mip=2,
                       simplification_factor=999999,
                       max_err=40,
                       parallel_download=8,
                       verbose=True,
                       cache_control="no-cache",
                       n_threads=1):

    serialized_cg_info = cg.get_serialized_info()
    del serialized_cg_info["credentials"]

    if not isinstance(lvl2_node_ids, dict):
        lvl2_node_ids = dict(zip(lvl2_node_ids, [None] * len(lvl2_node_ids)))

    mesh_dir = cv_mesh_dir or cg._mesh_dir

    multi_args = []
    for lvl2_node_id in lvl2_node_ids.keys():
        multi_args.append([
            serialized_cg_info, lvl2_node_id, lvl2_node_ids[lvl2_node_id],
            cv_path, mesh_dir, mip, simplification_factor, max_err,
            parallel_download, verbose, cache_control
        ])

    # Run parallelizing
    if n_threads == 1:
        mu.multiprocess_func(_mesh_lvl2_previews_threads,
                             multi_args,
                             n_threads=n_threads,
                             verbose=False,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_mesh_lvl2_previews_threads,
                                multi_args,
                                n_threads=n_threads)
Example #13
def count_nodes_and_edges(table_id, n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    bounds = np.array(cg.cv.bounds.to_list()).reshape(2, -1).T
    bounds -= bounds[:, 0:1]

    chunk_id_bounds = np.ceil((bounds / cg.chunk_size[:, None])).astype(np.int)

    chunk_coord_gen = itertools.product(*[range(*r) for r in chunk_id_bounds])
    chunk_coords = np.array(list(chunk_coord_gen), dtype=np.int)

    order = np.arange(len(chunk_coords))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, chunk_coords[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_count_nodes_and_edges,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_count_nodes_and_edges,
                                          multi_args,
                                          n_threads=n_threads)

    n_edges_per_chunk = []
    n_nodes_per_chunk = []
    for result in results:
        n_nodes_per_chunk.extend(result[0])
        n_edges_per_chunk.extend(result[1])

    return n_nodes_per_chunk, n_edges_per_chunk
Example #14
def get_latest_roots(cg,
                     time_stamp: Optional[datetime.datetime] = None,
                     n_threads: int = 1) -> Sequence[np.uint64]:

    # Create filters: time and id range
    max_seg_id = cg.get_max_seg_id(cg.root_chunk_id) + 1

    if n_threads == 1:
        n_blocks = 1
    else:
        n_blocks = int(np.min([n_threads * 3 + 1, max_seg_id]))

    seg_id_blocks = np.linspace(1, max_seg_id, n_blocks + 1, dtype=np.uint64)

    cg_serialized_info = cg.get_serialized_info()

    if n_threads > 1:
        del cg_serialized_info["credentials"]

    multi_args = []
    for i_id_block in range(0, len(seg_id_blocks) - 1):
        multi_args.append([
            seg_id_blocks[i_id_block], seg_id_blocks[i_id_block + 1],
            cg_serialized_info, time_stamp
        ])

    if n_threads == 1:
        results = mu.multiprocess_func(_read_root_rows_thread,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_read_root_rows_thread,
                                          multi_args,
                                          n_threads=n_threads)

    root_ids = []
    for result in results:
        root_ids.extend(result)

    return np.array(root_ids, dtype=np.uint64)
Example #15
def family_consistency_test(table_id, n_threads=64):
    """ Runs a simple test on the WHOLE graph

    tests: id in children(parent(id))

    :param table_id: str
    :param n_threads: int
    :return: dict
        n x 2 per layer
        each failed pair: (node_id, parent_id)
    """

    cg = chunkedgraph.ChunkedGraph(table_id)

    failed_node_id_dict = {}
    for layer_id in range(1, cg.n_layers):
        print("\n\n Layer %d \n\n" % layer_id)

        step = int(cg.fan_out ** np.max([0, layer_id - 2]))
        coords = list(itertools.product(range(0, 8, step),
                                        range(0, 8, step),
                                        range(0, 4, step)))

        multi_args = []
        for coord in coords:
            multi_args.append([table_id, coord, layer_id])

        collected_failed_node_ids = mu.multisubprocess_func(
            _family_consistency_test_thread, multi_args, n_threads=n_threads)

        failed_node_ids = []
        for _failed_node_ids in collected_failed_node_ids:
            failed_node_ids.extend(_failed_node_ids)

        failed_node_id_dict[layer_id] = np.array(failed_node_ids)

        print("\n%d nodes rows failed\n" % len(failed_node_ids))

    return failed_node_id_dict
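
A hypothetical follow-up that summarizes the result of `family_consistency_test` per layer; the table ID is a placeholder:

failed = family_consistency_test("example_table", n_threads=16)
for layer_id, pairs in failed.items():
    print("layer %d: %d failed (node_id, parent_id) pairs" % (layer_id, len(pairs)))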
Example #16
def write_flat_segmentation(cg,
                            dataset_name,
                            bounding_box=None,
                            block_factor=2,
                            n_threads=1,
                            mip=0):
    """ Applies the mapping in the chunkedgraph to the supervoxels to create
        a flattened segmentation

    :param cg: chunkedgraph instance
    :param dataset_name: str
    :param bounding_box: np.array
    :param block_factor: int
    :param n_threads: int
    :param mip: int
    :return: bool
    """

    if dataset_name == "pinky":
        from_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
        to_url = "gs://neuroglancer/svenmd/pinky40_v11/segmentation/"
    elif dataset_name == "basil":
        from_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
        to_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/segmentation/"
    else:
        raise Exception("Dataset unknown")

    from_cv = cloudvolume.CloudVolume(from_url, mip=mip)

    dataset_bounding_box = np.array(from_cv.bounds.to_list())

    block_bounding_box_cg = \
        [np.floor(dataset_bounding_box[:3] / cg.chunk_size).astype(np.int),
         np.ceil(dataset_bounding_box[3:] / cg.chunk_size).astype(np.int)]

    if bounding_box is not None:
        bounding_box_cg = \
            [np.floor(bounding_box[0] / cg.chunk_size).astype(np.int),
             np.ceil(bounding_box[1] / cg.chunk_size).astype(np.int)]

        m = block_bounding_box_cg[0] < bounding_box_cg[0]
        block_bounding_box_cg[0][m] = bounding_box_cg[0][m]

        m = block_bounding_box_cg[1] > bounding_box_cg[1]
        block_bounding_box_cg[1][m] = bounding_box_cg[1][m]

    block_iter = itertools.product(
        np.arange(block_bounding_box_cg[0][0], block_bounding_box_cg[1][0],
                  block_factor),
        np.arange(block_bounding_box_cg[0][1], block_bounding_box_cg[1][1],
                  block_factor),
        np.arange(block_bounding_box_cg[0][2], block_bounding_box_cg[1][2],
                  block_factor))
    blocks = np.array(list(block_iter))

    cg_info = cg.get_serialized_info()

    multi_args = []
    for start_block in blocks:
        end_block = start_block + block_factor
        m = end_block > block_bounding_box_cg[1]
        end_block[m] = block_bounding_box_cg[1][m]

        multi_args.append(
            [cg_info, start_block, end_block, from_url, to_url, mip])

    # Run parallelizing
    if n_threads == 1:
        mu.multiprocess_func(_write_flat_segmentation_thread,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_write_flat_segmentation_thread,
                                multi_args,
                                n_threads=n_threads)
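
A hypothetical call to `write_flat_segmentation`, assuming `cg` is a chunkedgraph instance; the bounding box is given as [[x_, y_, z_], [_x, _y, _z]] in voxel coordinates and is a placeholder:

bounding_box = np.array([[0, 0, 0], [10240, 10240, 1024]])
write_flat_segmentation(cg, "pinky",
                        bounding_box=bounding_box,
                        block_factor=2,
                        n_threads=16)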
Example #17
def create_chunked_graph(table_id=None,
                         cv_url=None,
                         ws_url=None,
                         fan_out=2,
                         bbox=None,
                         chunk_size=(512, 512, 128),
                         verbose=False,
                         n_threads=1):
    """ Creates chunked graph from downloaded files

    :param table_id: str
    :param cv_url: str
    :param ws_url: str
    :param fan_out: int
    :param bbox: [[x_, y_, z_], [_x, _y, _z]]
    :param chunk_size: tuple
    :param verbose: bool
    :param n_threads: int
    """
    if cv_url is None or ws_url is None:
        if "basil" in table_id:
            cv_url = "gs://nkem/basil_4k_oldnet/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/basil_4k_oldnet_cg/watershed/"
        elif "pinky40" in table_id:
            cv_url = "gs://nkem/pinky40_v11/mst_trimmed_sem_remap/region_graph/"
            ws_url = "gs://neuroglancer/svenmd/pinky40_v11/watershed/"
        elif "pinky100" in table_id:
            cv_url = "gs://nkem/pinky100_v0/region_graph/"
            ws_url = "gs://neuroglancer/nkem/pinky100_v0/ws/lost_no-random/bbox1_0/"
        else:
            raise Exception("Could not identify region graph ressource")

    times = []
    time_start = time.time()

    chunk_size = np.array(list(chunk_size))

    file_paths = np.sort(
        glob.glob(
            creator_utils.dir_from_layer_name(
                creator_utils.layer_name_from_cv_url(cv_url)) + "/*"))

    file_path_blocks = np.array_split(file_paths, n_threads * 3)

    multi_args = []
    for fp_block in file_path_blocks:
        multi_args.append([fp_block, table_id, chunk_size, bbox])

    if n_threads == 1:
        results = mu.multiprocess_func(_preprocess_chunkedgraph_data_thread,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=True,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_preprocess_chunkedgraph_data_thread,
                                          multi_args,
                                          n_threads=n_threads)

    in_chunk_connected_paths = np.array([])
    in_chunk_connected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    in_chunk_disconnected_paths = np.array([])
    in_chunk_disconnected_ids = np.array([], dtype=np.uint64).reshape(-1, 3)
    between_chunk_paths = np.array([])
    between_chunk_ids = np.array([], dtype=np.uint64).reshape(-1, 2, 3)
    isolated_paths = np.array([])
    isolated_ids = np.array([], dtype=np.uint64).reshape(-1, 3)

    for result in results:
        in_chunk_connected_paths = np.concatenate(
            [in_chunk_connected_paths, result[0]])
        in_chunk_connected_ids = np.concatenate(
            [in_chunk_connected_ids, result[1]])
        in_chunk_disconnected_paths = np.concatenate(
            [in_chunk_disconnected_paths, result[2]])
        in_chunk_disconnected_ids = np.concatenate(
            [in_chunk_disconnected_ids, result[3]])
        between_chunk_paths = np.concatenate([between_chunk_paths, result[4]])
        between_chunk_ids = np.concatenate([between_chunk_ids, result[5]])
        isolated_paths = np.concatenate([isolated_paths, result[6]])
        isolated_ids = np.concatenate([isolated_ids, result[7]])

    assert len(in_chunk_connected_ids) == len(in_chunk_connected_paths) == \
           len(in_chunk_disconnected_ids) == len(in_chunk_disconnected_paths) == \
           len(isolated_ids) == len(isolated_paths)

    in_chunk_connected_ids, in_chunk_connected_paths = \
        _sort_arrays(in_chunk_connected_ids, in_chunk_connected_paths)

    in_chunk_disconnected_ids, in_chunk_disconnected_paths = \
        _sort_arrays(in_chunk_disconnected_ids, in_chunk_disconnected_paths)

    isolated_ids, isolated_paths = \
        _sort_arrays(isolated_ids, isolated_paths)

    times.append(["Preprocessing", time.time() - time_start])

    print("Preprocessing took %.3fs = %.2fh" %
          (times[-1][1], times[-1][1] / 3600))

    time_start = time.time()

    multi_args = []

    in_chunk_id_blocks = np.array_split(in_chunk_connected_ids,
                                        max(1, n_threads))
    cumsum = 0

    for in_chunk_id_block in in_chunk_id_blocks:
        multi_args.append([
            between_chunk_ids, between_chunk_paths, in_chunk_id_block, cumsum
        ])
        cumsum += len(in_chunk_id_block)

    # Run parallelizing
    if n_threads == 1:
        results = mu.multiprocess_func(_between_chunk_masks_thread,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=True,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_between_chunk_masks_thread,
                                          multi_args,
                                          n_threads=n_threads)

    times.append(["Data sorting", time.time() - time_start])

    print("Data sorting took %.3fs = %.2fh" %
          (times[-1][1], times[-1][1] / 3600))

    time_start = time.time()

    n_layers = int(
        np.ceil(
            pychunkedgraph.backend.chunkedgraph_utils.log_n(
                np.max(in_chunk_connected_ids) + 1, fan_out))) + 2

    print("N layers: %d" % n_layers)

    cg = chunkedgraph.ChunkedGraph(table_id=table_id,
                                   n_layers=np.uint64(n_layers),
                                   fan_out=np.uint64(fan_out),
                                   chunk_size=np.array(chunk_size,
                                                       dtype=np.uint64),
                                   cv_path=ws_url,
                                   is_new=True)

    # Fill lowest layer and create first abstraction layer
    # Create arguments for parallelizing

    multi_args = []
    for result in results:
        offset, between_chunk_paths_out_masked, between_chunk_paths_in_masked = result

        for i_chunk in range(len(between_chunk_paths_out_masked)):
            multi_args.append([
                table_id, in_chunk_connected_paths[offset + i_chunk],
                in_chunk_disconnected_paths[offset + i_chunk],
                isolated_paths[offset + i_chunk],
                between_chunk_paths_in_masked[i_chunk],
                between_chunk_paths_out_masked[i_chunk], verbose
            ])

    random.shuffle(multi_args)

    print("%d jobs for creating layer 1 + 2" % len(multi_args))

    # Run parallelizing
    if n_threads == 1:
        mu.multiprocess_func(_create_atomic_layer_thread,
                             multi_args,
                             n_threads=n_threads,
                             verbose=True,
                             debug=n_threads == 1)
    else:
        mu.multisubprocess_func(_create_atomic_layer_thread,
                                multi_args,
                                n_threads=n_threads)

    times.append(["Layers 1 + 2", time.time() - time_start])

    # Fill higher abstraction layers
    child_chunk_ids = in_chunk_connected_ids.copy()
    for layer_id in range(3, n_layers + 1):

        time_start = time.time()

        print("\n\n\n --- LAYER %d --- \n\n\n" % layer_id)

        parent_chunk_ids = child_chunk_ids // cg.fan_out
        parent_chunk_ids = parent_chunk_ids.astype(np.int)

        u_pcids, inds = np.unique(parent_chunk_ids,
                                  axis=0,
                                  return_inverse=True)

        if len(u_pcids) > n_threads:
            n_threads_per_process = 1
        else:
            n_threads_per_process = int(np.ceil(n_threads / len(u_pcids)))

        multi_args = []
        for ind in range(len(u_pcids)):
            multi_args.append([
                table_id, layer_id,
                child_chunk_ids[inds == ind].astype(np.int),
                n_threads_per_process
            ])

        child_chunk_ids = u_pcids

        # Run parallelizing
        if n_threads == 1:
            mu.multiprocess_func(_add_layer_thread,
                                 multi_args,
                                 n_threads=n_threads,
                                 verbose=True,
                                 debug=n_threads == 1)
        else:
            mu.multisubprocess_func(_add_layer_thread,
                                    multi_args,
                                    n_threads=n_threads,
                                    suffix=str(layer_id))

        times.append(["Layer %d" % layer_id, time.time() - time_start])

    for time_entry in times:
        print("%s: %.2fs = %.2fmin = %.2fh" %
              (time_entry[0], time_entry[1], time_entry[1] / 60,
               time_entry[1] / 3600))
Example #18
    def mesh_single_layer(self, layer, bounding_box=None, block_factor=2,
                          n_threads=128):
        assert layer <= self.highest_mesh_layer

        dataset_bounding_box = np.array(self.cv.bounds.to_list())

        block_bounding_box_cg = \
            [np.floor(dataset_bounding_box[:3] /
                      self.cg.chunk_size).astype(np.int),
             np.ceil(dataset_bounding_box[3:] /
                     self.cg.chunk_size).astype(np.int)]

        if bounding_box is not None:
            bounding_box_cg = \
                [np.floor(bounding_box[0] /
                          self.cg.chunk_size).astype(np.int),
                 np.ceil(bounding_box[1] /
                         self.cg.chunk_size).astype(np.int)]

            m = block_bounding_box_cg[0] < bounding_box_cg[0]
            block_bounding_box_cg[0][m] = bounding_box_cg[0][m]

            m = block_bounding_box_cg[1] > bounding_box_cg[1]
            block_bounding_box_cg[1][m] = bounding_box_cg[1][m]

        block_bounding_box_cg = np.array(block_bounding_box_cg) / \
            2 ** np.max([0, layer - 2])
        block_bounding_box_cg = np.ceil(block_bounding_box_cg)

        # approximate number of mesh jobs for the current block_factor
        n_jobs = np.product(block_bounding_box_cg[1] -
                            block_bounding_box_cg[0]) / \
                 block_factor ** 2

        # shrink the blocks until there is at least one job per thread
        while n_jobs < n_threads and block_factor > 1:
            block_factor -= 1

            n_jobs = np.product(block_bounding_box_cg[1] -
                                block_bounding_box_cg[0]) / \
                     block_factor ** 2

        block_iter = itertools.product(np.arange(block_bounding_box_cg[0][0],
                                                 block_bounding_box_cg[1][0],
                                                 block_factor),
                                       np.arange(block_bounding_box_cg[0][1],
                                                 block_bounding_box_cg[1][1],
                                                 block_factor),
                                       np.arange(block_bounding_box_cg[0][2],
                                                 block_bounding_box_cg[1][2],
                                                 block_factor))

        blocks = np.array(list(block_iter), dtype=np.int)

        cg_info = self.cg.get_serialized_info()
        del (cg_info['credentials'])

        multi_args = []
        for start_block in blocks:
            end_block = start_block + block_factor
            m = end_block > block_bounding_box_cg[1]
            end_block[m] = block_bounding_box_cg[1][m]

            multi_args.append([cg_info, start_block, end_block, self.cg.cv_path,
                               self.cv_mesh_dir, self.mesh_mip, layer])
        random.shuffle(multi_args)

        # Run parallelizing
        if n_threads == 1:
            mu.multiprocess_func(meshgen._mesh_layer_thread, multi_args,
                                 n_threads=n_threads, verbose=True,
                                 debug=n_threads == 1)
        else:
            mu.multisubprocess_func(meshgen._mesh_layer_thread, multi_args,
                                    n_threads=n_threads,
                                    suffix="%s_%d" % (self.table_id, layer))
Example #19
def get_root_ids_and_sv_chunks(table_id,
                               save_dir=f"{HOME}/benchmarks/",
                               n_threads=1):
    cg = chunkedgraph.ChunkedGraph(table_id)

    save_folder = f"{save_dir}/{table_id}/"

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    if not os.path.exists(f"{save_folder}/root_ids.h5"):
        root_ids = chunkedgraph_comp.get_latest_roots(cg, n_threads=n_threads)

        with h5py.File(f"{save_folder}/root_ids.h5", "w") as f:
            f.create_dataset("root_ids", data=root_ids)
    else:
        with h5py.File(f"{save_folder}/root_ids.h5", "r") as f:
            root_ids = f["root_ids"].value

    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 1:
        del cg_serialized_info["credentials"]

    order = np.arange(len(root_ids))
    np.random.shuffle(order)

    n_blocks = np.min([len(order), n_threads * 3])
    blocks = np.array_split(order, n_blocks)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, root_ids[block]])

    if n_threads == 1:
        results = mu.multiprocess_func(_get_root_ids_and_sv_chunks,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_root_ids_and_sv_chunks,
                                          multi_args,
                                          n_threads=n_threads)

    root_ids = []
    n_l1_nodes_per_root = []
    rep_l1_nodes = []
    rep_l1_chunk_ids = []
    for result in results:
        root_ids.extend(result[0])
        n_l1_nodes_per_root.extend(result[1])
        rep_l1_nodes.extend(result[2])
        rep_l1_chunk_ids.extend(result[3])

    save_folder = f"{save_dir}/{table_id}/"

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    with h5py.File(f"{save_folder}/root_stats.h5", "w") as f:
        f.create_dataset("root_ids", data=root_ids, compression="gzip")
        f.create_dataset("n_l1_nodes_per_root",
                         data=n_l1_nodes_per_root,
                         compression="gzip")
        f.create_dataset("rep_l1_nodes", data=rep_l1_nodes, compression="gzip")
        f.create_dataset("rep_l1_chunk_ids",
                         data=rep_l1_chunk_ids,
                         compression="gzip")
Example #20
def get_merge_split_timings(table_id,
                            save_dir=f"{HOME}/benchmarks/",
                            job_size=500,
                            n_threads=1):
    save_folder = f"{save_dir}/{table_id}/"

    merge_edges, merge_edge_weights = load_merge_stats(save_folder)

    probs = merge_edge_weights / np.sum(merge_edge_weights)

    n_jobs = n_threads * 3

    cg = chunkedgraph.ChunkedGraph(table_id)
    cg_serialized_info = cg.get_serialized_info()
    if n_threads > 0:
        del cg_serialized_info["credentials"]

    time_start = time.time()
    order = np.arange(len(merge_edges))

    np.random.seed(np.int(time.time()))

    replace = False

    blocks = np.random.choice(order,
                              job_size * n_jobs,
                              p=probs,
                              replace=replace).reshape(n_jobs, job_size)

    multi_args = []
    for block in blocks:
        multi_args.append([cg_serialized_info, merge_edges[block]])
    print(f"Building jobs took {time.time()-time_start}s")

    time_start = time.time()
    if n_threads == 1:
        results = mu.multiprocess_func(_get_merge_timings,
                                       multi_args,
                                       n_threads=n_threads,
                                       verbose=False,
                                       debug=n_threads == 1)
    else:
        results = mu.multisubprocess_func(_get_merge_timings,
                                          multi_args,
                                          n_threads=n_threads)
    dt = time.time() - time_start

    timings = []
    for result in results:
        timings.extend(result[0])

    percentiles = [np.percentile(timings, k) for k in range(1, 100, 1)]
    mean = np.mean(timings)
    std = np.std(timings)
    median = np.median(timings)

    merge_results = {
        "percentiles": percentiles,
        "p01": percentiles[0],
        "p05": percentiles[4],
        "p95": percentiles[94],
        "p99": percentiles[98],
        "mean": mean,
        "std": std,
        "median": median,
        "total_time_s": dt,
        "job_size": job_size,
        "n_jobs": n_jobs,
        "n_threads": n_threads,
        "replace": replace,
        "requests_per_s": job_size * n_jobs / dt
    }

    timings = []
    for result in results:
        timings.extend(result[1])

    percentiles = [np.percentile(timings, k) for k in range(1, 100, 1)]
    mean = np.mean(timings)
    std = np.std(timings)
    median = np.median(timings)

    split_results = {
        "percentiles": percentiles,
        "p01": percentiles[0],
        "p05": percentiles[4],
        "p95": percentiles[94],
        "p99": percentiles[98],
        "mean": mean,
        "std": std,
        "median": median,
        "total_time_s": dt,
        "job_size": job_size,
        "n_jobs": n_jobs,
        "n_threads": n_threads,
        "replace": replace,
        "requests_per_s": job_size * n_jobs / dt
    }

    return merge_results, split_results
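
A hypothetical follow-up that compares the merge and split latency statistics returned above; the table ID is a placeholder:

merge_results, split_results = get_merge_split_timings("example_table",
                                                       job_size=500,
                                                       n_threads=8)
print("merge median: %.3fs, split median: %.3fs"
      % (merge_results["median"], split_results["median"]))
print("requests/s: %.1f" % merge_results["requests_per_s"])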