def segment_to_body_mapping_from_edge_csv(csv_path, output_csv_path=None):
    """
    Load and return the a segment-to-body mapping (a.k.a. an "equivalence_mapping" in Brainmaps terminology,
    from the given csv_path of equivalence edges (or complete merge graph).

    That is, compute the groups "connected components" of the graph,
    and map each node to its owning group.
    
    Each row represents an edge. For example:
    
        123,456
        123,789
        789,234
        
    The CSV file may optionally contain a header row.
    Also, it may contain more than two columns, but only the first two columns are used.
    
    Args:
        csv_path:
            Path to a csv file whose first two columns are edge pairs
        
        output_csv_path:
            (Optional.) If provided, also write the results to a CSV file.
        
    Returns:
        ndarray with two columns representing node and group

    Note: The returned array is NOT merely the parsed CSV.
          It has been transformed from equivalence edges to node mappings,
          via a connected components step.
    """
    edges = load_edge_csv(csv_path)
    mapping = mapping_from_edges(edges)

    if output_csv_path:
        equivalence_mapping_to_csv(mapping, output_csv_path)

    return mapping
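
# mapping_from_edges() isn't shown in this file. Below is a hypothetical
# stand-in sketching the connected-components step the docstring describes,
# assuming node IDs are non-negative integers and scipy is available.
import numpy as np
import scipy.sparse as sp
from scipy.sparse.csgraph import connected_components

def _example_mapping_from_edges(edges):
    """
    edges: (N, 2) ndarray of equivalence edges.
    Returns an (M, 2) ndarray of (node, group) rows, one per unique node,
    where each group is labeled with the smallest node ID in its component.
    """
    nodes = np.unique(edges)                    # sorted unique node IDs
    rows = np.searchsorted(nodes, edges[:, 0])  # dense indices for endpoints
    cols = np.searchsorted(nodes, edges[:, 1])
    n = len(nodes)
    g = sp.coo_matrix((np.ones(len(rows)), (rows, cols)), shape=(n, n))
    _, cc = connected_components(g, directed=False)
    groups = np.empty_like(nodes)
    for label in np.unique(cc):
        members = nodes[cc == label]
        groups[cc == label] = members.min()     # canonical ID per component
    return np.stack((nodes, groups), axis=1)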
def main_impl(args):
    # Read agglomeration file
    segment_to_body_df = None
    if args.agglomeration_mapping:
        with Timer("Loading agglomeration mapping", logger):
            if args.agglomeration_mapping.endswith('.csv'):
                mapping_pairs = load_edge_csv(args.agglomeration_mapping)
                segment_to_body_df = pd.DataFrame(mapping_pairs, columns=AGGLO_MAP_COLUMNS)
            elif args.agglomeration_mapping.endswith('.npy'):
                mapping_pairs = np.load(args.agglomeration_mapping)
                # Accept either a (N,2) array or an (N,) record array
                if mapping_pairs.ndim == 2 and mapping_pairs.shape[1] == 2:
                    segment_to_body_df = pd.DataFrame(mapping_pairs, columns=AGGLO_MAP_COLUMNS)
                elif mapping_pairs.ndim == 1:
                    # 1-D structured (record) array: the field names become
                    # the DataFrame's column names.
                    segment_to_body_df = pd.DataFrame(mapping_pairs)
                    assert segment_to_body_df.columns.tolist() == AGGLO_MAP_COLUMNS, \
                        f"mapping given in {args.agglomeration_mapping} has the wrong column names."
                else:
                    raise RuntimeError(f"Did not understand mapping file: {args.agglomeration_mapping}")
            else:
                # Not a .csv or .npy file: interpret the argument as a hex UUID.
                if set(args.agglomeration_mapping) - set('0123456789abcdef'):
                    raise RuntimeError(f"Your agglomeration mapping is neither a CSV file nor a UUID: {args.agglomeration_mapping}")

                mapping_uuid = args.agglomeration_mapping
                logger.info(f"Loading agglomeration mapping from UUID {mapping_uuid}")
                mapping_series = fetch_complete_mappings(args.server, mapping_uuid, args.labelmap_instance)
                segment_to_body_df = pd.DataFrame( {'segment_id': mapping_series.index.values} )
                segment_to_body_df['body_id'] = mapping_series.values
                assert (segment_to_body_df.columns == AGGLO_MAP_COLUMNS).all()

    subset_labels = None
    if args.subset_labels:
        is_supervoxels = (args.agglomeration_mapping is None)
        subset_labels = load_body_list(args.subset_labels, is_supervoxels)
        subset_labels = set(subset_labels)

    if args.last_mutid is None:
        args.last_mutid = fetch_repo_info(args.server, args.uuid)['MutationID']

    # Upload label indexes
    if args.operation in ('indexes', 'both', 'sort-only'):
        if not args.supervoxel_block_stats_h5:
            raise RuntimeError("You must provide a supervoxel_block_stats_h5 file if you want to ingest LabelIndexes")

        # Read block stats file
        block_sv_stats, presorted_by, agglomeration_path = load_stats_h5_to_records(args.supervoxel_block_stats_h5)
        
        stats_are_presorted = False
        if args.agglomeration_mapping:
            if (presorted_by == 'body_id') and (agglomeration_path == args.agglomeration_mapping):
                stats_are_presorted = True
        elif presorted_by == 'segment_id':
            stats_are_presorted = True
        
        if stats_are_presorted:
            logger.info("Stats are pre-sorted")
        else:
            output_dir, basename = os.path.split(os.path.abspath(args.supervoxel_block_stats_h5))
            if segment_to_body_df is None:
                output_path = output_dir + '/sorted-by-segment-' + basename
            else:
                output_path = output_dir + '/sorted-by-body-' + basename
            sort_block_stats(block_sv_stats, segment_to_body_df, output_path, args.agglomeration_mapping)
    
        if args.operation == 'sort-only':
            return

        with Timer(f"Grouping {len(block_sv_stats)} blockwise supervoxel counts and loading LabelIndices", logger):
            ingest_label_indexes( args.server,
                                  args.uuid,
                                  args.labelmap_instance,
                                  args.last_mutid,
                                  block_sv_stats,
                                  subset_labels,
                                  args.tombstones,
                                  batch_rows=args.batch_size,
                                  num_threads=args.num_threads,
                                  check_mismatches=args.check_mismatches )

    # Upload mappings
    if args.operation in ('mappings', 'both'):
        if not args.agglomeration_mapping:
            raise RuntimeError("Can't load mappings without an agglomeration-mapping file.")
        
        with Timer(f"Loading mapping ops", logger):
            ingest_mapping( args.server,
                            args.uuid,
                            args.labelmap_instance,
                            args.last_mutid,
                            segment_to_body_df,
                            subset_labels,
                            args.batch_size )
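
# For reference, a hedged sketch of producing the 1-D record-array .npy format
# accepted by the '.npy' branch above. The field names assume AGGLO_MAP_COLUMNS
# is ['segment_id', 'body_id'], consistent with the columns constructed in the
# UUID branch; the uint64 dtype is an assumption.
import numpy as np

def _example_write_agglo_mapping_npy(segment_ids, body_ids, npy_path):
    records = np.zeros(len(segment_ids),
                       dtype=[('segment_id', np.uint64), ('body_id', np.uint64)])
    records['segment_id'] = segment_ids
    records['body_id'] = body_ids
    # pd.DataFrame(records) will pick up the field names as column names.
    np.save(npy_path, records)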
def load_labelmap(labelmap_config, working_dir):
    """
    Load a labelmap file as specified in the given labelmap_config,
    which must conform to LabelMapSchema.
    
    If the labelmap file exists on gbuckets, it will be downloaded first.
    If it is gzip-compressed, it will be unpacked.
    
    The final downloaded/uncompressed file will be saved into working_dir,
    and the final path will be overwritten in the labelmap_config.
    """
    path = labelmap_config["file"]

    # path is [gs://][/]path/to/file.csv[.gz]

    # If the file is in a gbucket, download it first (if necessary)
    if path.startswith('gs://'):
        filename = path.split('/')[-1]
        downloaded_path = working_dir + '/' + filename
        if not os.path.exists(downloaded_path):
            cmd = f'gsutil -q cp {path} {downloaded_path}'
            logger.info(cmd)
            subprocess.check_call(cmd, shell=True)
        path = downloaded_path
    elif not path.startswith("/"):
        # Relative local path: interpret it relative to working_dir.
        # (This must not apply to gs:// paths, which were just downloaded above.)
        path = os.path.normpath(os.path.join(working_dir, path))

    # Now path is /path/to/file.csv[.gz]

    if not os.path.exists(path) and os.path.exists(path + '.gz'):
        path = path + '.gz'

    # If the file is compressed, decompress it
    if os.path.splitext(path)[1] == '.gz':
        uncompressed_path = path[:-3]  # drop '.gz'
        if not os.path.exists(uncompressed_path):
            subprocess.check_call(f"gunzip {path}", shell=True)
            assert os.path.exists(uncompressed_path), \
                "Tried to uncompress the labelmap CSV file... where did it go?"
        path = uncompressed_path

    # Now path is /path/to/file.csv
    # Overwrite the final downloaded/unpacked location
    labelmap_config['file'] = path

    # Mapping is only loaded into numpy once, on the driver
    if labelmap_config["file-type"] == "label-to-body":
        logger.info(f"Loading label-to-body mapping from {path}")
        with Timer("Loading mapping", logger):
            mapping_pairs = load_edge_csv(path)

    elif labelmap_config["file-type"] in ("equivalence-edges", "body-rag"):
        logger.info(f"Loading equivalence mapping from {path}")
        with Timer("Loading mapping", logger):
            mapping_pairs = segment_to_body_mapping_from_edge_csv(path)

        # Export mapping to disk in case anyone wants to view it later
        output_dir, basename = os.path.split(path)
        mapping_csv_path = f'{output_dir}/LABEL-TO-BODY-{basename}'
        if not os.path.exists(mapping_csv_path):
            with open(mapping_csv_path, 'w') as f:
                csv.writer(f).writerows(mapping_pairs)
    else:
        raise RuntimeError(
            f"Unknown labelmap file-type: {labelmap_config['file-type']}")

    return mapping_pairs
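
# A hypothetical usage sketch for load_labelmap(). LabelMapSchema itself isn't
# shown here; the keys ("file", "file-type") and the recognized file-type values
# are taken from the accesses in the function body. The bucket path is made up.
def _example_load_labelmap():
    labelmap_config = {
        "file": "gs://example-bucket/equivalence-edges.csv.gz",
        "file-type": "equivalence-edges",
    }
    mapping_pairs = load_labelmap(labelmap_config, working_dir='/tmp')
    # After the call, labelmap_config['file'] points at the local, uncompressed CSV.
    print(labelmap_config['file'], mapping_pairs.shape)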