def segment_to_body_mapping_from_edge_csv(csv_path, output_csv_path=None):
    """
    Load and return a segment-to-body mapping (a.k.a. an "equivalence_mapping"
    in Brainmaps terminology) from the given csv_path of equivalence edges
    (or complete merge graph).

    That is, compute the groups (connected components) of the graph,
    and map each node to its owning group.

    Each row represents an edge. For example:

        123,456
        123,789
        789,234

    The CSV file may optionally contain a header row.
    Also, it may contain more than two columns, but only the first two columns are used.

    Args:
        csv_path:
            Path to a CSV file whose first two columns are edge pairs.

        output_csv_path:
            (Optional.) If provided, also write the results to a CSV file.

    Returns:
        ndarray with two columns representing node and group

    Note: The returned array is NOT merely the parsed CSV.
          It has been transformed from equivalence edges to node mappings,
          via a connected components step.
    """
    edges = load_edge_csv(csv_path)
    mapping = mapping_from_edges(edges)

    if output_csv_path:
        equivalence_mapping_to_csv(mapping, output_csv_path)

    return mapping
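# A minimal usage sketch of the function above (not part of the original API),
# assuming load_edge_csv() and mapping_from_edges() behave as the docstring
# describes. It writes the docstring's example edges to a temporary CSV and
# checks that all connected nodes end up in a single group.
def _demo_segment_to_body_mapping():
    import tempfile
    import pandas as pd

    tmp = tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False)
    tmp.close()

    # Edges 123-456, 123-789, 789-234 form one connected component.
    pd.DataFrame([[123, 456], [123, 789], [789, 234]]) \
        .to_csv(tmp.name, header=False, index=False)

    mapping = segment_to_body_mapping_from_edge_csv(tmp.name)

    # One row per node, two columns (node, group); all four nodes
    # share a single group ID.
    assert mapping.shape == (4, 2)
    assert len(set(mapping[:, 1])) == 1
    return mapping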
def main_impl(args):
    # Read agglomeration file
    segment_to_body_df = None
    if args.agglomeration_mapping:
        with Timer("Loading agglomeration mapping", logger):
            if args.agglomeration_mapping.endswith('.csv'):
                mapping_pairs = load_edge_csv(args.agglomeration_mapping)
                segment_to_body_df = pd.DataFrame(mapping_pairs, columns=AGGLO_MAP_COLUMNS)
            elif args.agglomeration_mapping.endswith('.npy'):
                mapping_pairs = np.load(args.agglomeration_mapping)
                # Accept either a (N,2) array or an (N,) record array
                if mapping_pairs.ndim == 2 and mapping_pairs.shape[1] == 2:
                    segment_to_body_df = pd.DataFrame(mapping_pairs, columns=AGGLO_MAP_COLUMNS)
                elif mapping_pairs.ndim == 1:
                    segment_to_body_df = pd.DataFrame(mapping_pairs)
                    assert segment_to_body_df.columns.tolist() == AGGLO_MAP_COLUMNS, \
                        f"mapping given in {args.agglomeration_mapping} has the wrong column names."
                else:
                    raise RuntimeError(f"Did not understand mapping file: {args.agglomeration_mapping}")
            else:
                if set(args.agglomeration_mapping) - set('0123456789abcdef'):
                    raise RuntimeError(f"Your agglomeration mapping is neither a CSV file nor a UUID: {args.agglomeration_mapping}")
                mapping_uuid = args.agglomeration_mapping
                logger.info(f"Loading agglomeration mapping from UUID {mapping_uuid}")
                mapping_series = fetch_complete_mappings(args.server, mapping_uuid, args.labelmap_instance)
                segment_to_body_df = pd.DataFrame( {'segment_id': mapping_series.index.values} )
                segment_to_body_df['body_id'] = mapping_series.values
                assert (segment_to_body_df.columns == AGGLO_MAP_COLUMNS).all()

    subset_labels = None
    if args.subset_labels:
        is_supervoxels = (args.agglomeration_mapping is None)
        subset_labels = load_body_list(args.subset_labels, is_supervoxels)
        subset_labels = set(subset_labels)

    if args.last_mutid is None:
        args.last_mutid = fetch_repo_info(args.server, args.uuid)['MutationID']

    # Upload label indexes
    if args.operation in ('indexes', 'both', 'sort-only'):
        if not args.supervoxel_block_stats_h5:
            raise RuntimeError("You must provide a supervoxel_block_stats_h5 file if you want to ingest LabelIndexes")

        # Read block stats file
        block_sv_stats, presorted_by, agglomeration_path = load_stats_h5_to_records(args.supervoxel_block_stats_h5)

        stats_are_presorted = False
        if args.agglomeration_mapping:
            if (presorted_by == 'body_id') and (agglomeration_path == args.agglomeration_mapping):
                stats_are_presorted = True
        elif presorted_by == 'segment_id':
            stats_are_presorted = True

        if stats_are_presorted:
            logger.info("Stats are pre-sorted")
        else:
            output_dir, basename = os.path.split(os.path.abspath(args.supervoxel_block_stats_h5))
            if segment_to_body_df is None:
                output_path = output_dir + '/sorted-by-segment-' + basename
            else:
                output_path = output_dir + '/sorted-by-body-' + basename
            sort_block_stats(block_sv_stats, segment_to_body_df, output_path, args.agglomeration_mapping)

        if args.operation == 'sort-only':
            return

        with Timer(f"Grouping {len(block_sv_stats)} blockwise supervoxel counts and loading LabelIndices", logger):
            ingest_label_indexes( args.server,
                                  args.uuid,
                                  args.labelmap_instance,
                                  args.last_mutid,
                                  block_sv_stats,
                                  subset_labels,
                                  args.tombstones,
                                  batch_rows=args.batch_size,
                                  num_threads=args.num_threads,
                                  check_mismatches=args.check_mismatches )

    # Upload mappings
    if args.operation in ('mappings', 'both'):
        if not args.agglomeration_mapping:
            raise RuntimeError("Can't load mappings without an agglomeration-mapping file.")

        with Timer("Loading mapping ops", logger):
            ingest_mapping( args.server,
                            args.uuid,
                            args.labelmap_instance,
                            args.last_mutid,
                            segment_to_body_df,
                            subset_labels,
                            args.batch_size )
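# A hypothetical driver for main_impl(), shown only to illustrate the
# attributes the function reads from `args`. The attribute names come from
# the code above; the flag spellings and defaults below are assumptions,
# not the original CLI definition.
def _example_parser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--agglomeration-mapping',
                        help="Path to a .csv/.npy mapping file, or a DVID UUID")
    parser.add_argument('--subset-labels',
                        help="Restrict ingestion to these bodies/supervoxels")
    parser.add_argument('--last-mutid', type=int, default=None)
    parser.add_argument('--operation', default='both',
                        choices=['indexes', 'mappings', 'both', 'sort-only'])
    parser.add_argument('--supervoxel-block-stats-h5')
    parser.add_argument('--batch-size', type=int, default=100_000)  # assumed default
    parser.add_argument('--num-threads', type=int, default=1)       # assumed default
    parser.add_argument('--tombstones', action='store_true')        # assumed flag form
    parser.add_argument('--check-mismatches', action='store_true')
    parser.add_argument('server')
    parser.add_argument('uuid')
    parser.add_argument('labelmap_instance')
    return parser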
def load_labelmap(labelmap_config, working_dir):
    """
    Load a labelmap file as specified in the given labelmap_config,
    which must conform to LabelMapSchema.

    If the labelmap file exists on gbuckets, it will be downloaded first.
    If it is gzip-compressed, it will be unpacked.

    The final downloaded/uncompressed file will be saved into working_dir,
    and the final path will be overwritten in the labelmap_config.
    """
    path = labelmap_config["file"]

    # path is [gs://][/]path/to/file.csv[.gz]

    # If the file is in a gbucket, download it first (if necessary)
    if path.startswith('gs://'):
        filename = path.split('/')[-1]
        downloaded_path = working_dir + '/' + filename
        if not os.path.exists(downloaded_path):
            cmd = f'gsutil -q cp {path} {downloaded_path}'
            logger.info(cmd)
            subprocess.check_call(cmd, shell=True)
        path = downloaded_path

    if not labelmap_config["file"].startswith("/"):
        path = os.path.normpath( os.path.join(working_dir, labelmap_config["file"]) )

    # Now path is /path/to/file.csv[.gz]

    if not os.path.exists(path) and os.path.exists(path + '.gz'):
        path = path + '.gz'

    # If the file is compressed, decompress it
    if os.path.splitext(path)[1] == '.gz':
        uncompressed_path = path[:-3] # drop '.gz'
        if not os.path.exists(uncompressed_path):
            subprocess.check_call(f"gunzip {path}", shell=True)
            assert os.path.exists(uncompressed_path), \
                "Tried to uncompress the labelmap CSV file... where did it go?"
        path = uncompressed_path

    # Now path is /path/to/file.csv

    # Overwrite the final downloaded/unpacked location
    labelmap_config['file'] = path

    # Mapping is only loaded into numpy once, on the driver
    if labelmap_config["file-type"] == "label-to-body":
        logger.info(f"Loading label-to-body mapping from {path}")
        with Timer("Loading mapping", logger):
            mapping_pairs = load_edge_csv(path)

    elif labelmap_config["file-type"] in ("equivalence-edges", "body-rag"):
        logger.info(f"Loading equivalence mapping from {path}")
        with Timer("Loading mapping", logger):
            mapping_pairs = segment_to_body_mapping_from_edge_csv(path)

        # Export mapping to disk in case anyone wants to view it later
        output_dir, basename = os.path.split(path)
        mapping_csv_path = f'{output_dir}/LABEL-TO-BODY-{basename}'
        if not os.path.exists(mapping_csv_path):
            with open(mapping_csv_path, 'w') as f:
                csv.writer(f).writerows(mapping_pairs)
    else:
        raise RuntimeError(f"Unknown labelmap file-type: {labelmap_config['file-type']}")

    return mapping_pairs
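# A minimal usage sketch of load_labelmap() (not part of the original API).
# Only the "file" and "file-type" keys are referenced by the function above;
# the paths and CSV contents here are hypothetical.
def _demo_load_labelmap():
    import tempfile
    working_dir = tempfile.mkdtemp()

    # Write a tiny label-to-body CSV: first column label, second column body.
    csv_path = working_dir + '/label-to-body.csv'
    with open(csv_path, 'w') as f:
        f.write("123,1\n456,1\n789,2\n")

    config = { "file": csv_path,             # may also be gs://... or *.csv.gz
               "file-type": "label-to-body" } # or "equivalence-edges" / "body-rag"

    mapping_pairs = load_labelmap(config, working_dir)

    # After the call, config['file'] holds the final local, uncompressed path.
    assert config['file'] == csv_path
    return mapping_pairs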