Example 1
def main(argsv):
    import argparse

    parser = argparse.ArgumentParser(description='Run cirrocumulus')
    parser.add_argument(
        'dataset',
        help=
        'Path to dataset in h5ad, loom, Seurat, TileDB, zarr, or STAR-Fusion format. Separate multiple datasets with '
        'a comma instead of a space in order to join datasets by cell id',
        nargs='+')
    parser.add_argument('--spatial', help=SPATIAL_HELP, nargs='*')
    parser.add_argument(
        '--markers',
        help=
        'Path to JSON file that maps name to features. For example {"a":["gene1", "gene2"], "b":["gene3"]}',
        nargs='*')
    parser.add_argument(
        '--host', help='Host IP address'
    )  # set to 0.0.0.0 to make it accessible from other computers WITHOUT SECURITY.

    parser.add_argument('--port', help='Server port', default=5000, type=int)
    parser.add_argument('--no-open',
                        dest='no_open',
                        help='Do not open your web browser',
                        action='store_true')
    parser.add_argument(
        '--results',
        help='URL to save user computed results (e.g. differential expression)'
    )
    parser.add_argument('--ontology',
                        help='Path to ontology in OBO format for annotation')
    args = parser.parse_args(argsv)
    if args.results is not None:
        os.environ[CIRRO_JOB_RESULTS] = args.results
    else:
        os.environ[CIRRO_JOB_RESULTS] = os.path.join(
            os.path.dirname(args.dataset[0].rstrip('/')), 'results')
    get_fs(os.environ[CIRRO_JOB_RESULTS]).makedirs(
        os.environ[CIRRO_JOB_RESULTS], exist_ok=True)
    if args.ontology is not None:
        os.environ[CIRRO_CELL_ONTOLOGY] = args.ontology
    app = create_app()
    configure_app(app, args.dataset, args.spatial, args.markers)
    if not args.no_open:
        import webbrowser, requests
        host = args.host if args.host is not None else 'http://127.0.0.1'
        url = host + ':' + str(args.port)
        try:
            if requests.get(url).ok:
                import sys
                sys.exit('Address already in use')
        except requests.exceptions.RequestException:
            pass  # no response means nothing is listening, so the port is free
        webbrowser.open(url)
    app.run(host=args.host, port=args.port, debug=False)
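
A minimal invocation sketch, assuming this main is importable; the file name and port are hypothetical:

# Hypothetical invocation: serve a local .h5ad on port 5001 without
# auto-opening a browser.
main(['pbmc3k.h5ad', '--port', '5001', '--no-open'])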
Example 2
 def get_result(self, dataset, result_id):
     path = dataset["url"]
     provider = self.get_dataset_provider(path)
     return provider.get_result(get_fs(path),
                                path,
                                dataset=dataset,
                                result_id=result_id)
Example 3
 def read_dataset(self, dataset, keys=[]):
     path = dataset["url"]
     provider = self.get_dataset_provider(path)
     return provider.read_dataset(get_fs(path),
                                  path,
                                  keys=keys,
                                  dataset=dataset)
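
Both wrappers dispatch on the dataset URL via get_dataset_provider and pass along the matching filesystem from get_fs; a usage sketch, where db stands for an instance of the surrounding class and all values are hypothetical:

dataset = {"id": "ds1", "url": "/data/pbmc3k.zarr"}   # hypothetical record
adata = db.read_dataset(dataset, keys=["louvain"])    # keys are hypothetical
result = db.get_result(dataset, result_id="de-1")     # result id is hypothetical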
Example 4
 def delete_job_result(self, result):
     if result is not None:
         if isinstance(result, dict) and result.get('url') is not None:
             fs = get_fs(result['url'])
             if fs.exists(result['url']):
                 fs.rm(result['url'], recursive=True)
         else:
             self.get_gridfs().delete(ObjectId(result))
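
The result can be either a dict carrying a 'url' (file-backed) or a bare GridFS id; a sketch with hypothetical values, db again standing for an instance of the surrounding class:

# File-backed result: removed through its filesystem.
db.delete_job_result({'url': '/results/de-42.json.gz'})
# Anything else is treated as a GridFS ObjectId string.
db.delete_job_result('64b1f0c2e4b0a1a2b3c4d5e6')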
Example 5
 def get_schema(self, dataset):
     path = dataset["url"]
     provider = self.get_dataset_provider(path)
     schema_dict = provider.get_schema(get_fs(path), path)
     if "summary" in dataset:
         schema_dict["summary"] = dataset["summary"]
     if "markers" in schema_dict:
         schema_dict["markers_read_only"] = schema_dict.pop("markers")
     return schema_dict
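
A sketch of the reshaping applied here (dataset dict and provider output are hypothetical):

# If the provider schema contains {"markers": [...]}, the key comes back
# renamed to "markers_read_only"; the dataset's own "summary" is copied in.
schema = db.get_schema({"url": "/data/pbmc3k.zarr", "summary": "PBMC demo"})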
Example 6
 def get_dataset_info(self, dataset):
     dataset_id = dataset["id"]
     if self.cached_dataset_id == dataset_id:
         dataset_info = self.cached_dataset_info
     else:
         path = dataset["url"]
         provider = self.get_dataset_provider(path)
         dataset_info = provider.get_dataset_info(get_fs(path), path)
         self.cached_dataset_info = dataset_info
         self.cached_dataset_id = dataset_id
     return dataset_info
Example 7
    def __init__(self, paths):
        super().__init__()
        self.dataset_to_info = {}  # json_data, meta, json_path
        self.job_id_to_job = {}
        if os.environ.get(CIRRO_JOB_RESULTS) is not None:  # load saved on disk
            fs = get_fs(os.environ[CIRRO_JOB_RESULTS])
            fs.makedirs(os.environ[CIRRO_JOB_RESULTS], exist_ok=True)
            for url in fs.ls(os.environ[CIRRO_JOB_RESULTS]):
                if url.lower().endswith(".json.gz"):
                    import gzip

                    with gzip.open(fs.open(url)) as f:
                        d = json.load(f)
                        if "id" in d:
                            d["url"] = url
                            self.job_id_to_job[d["id"]] = d
                elif url.lower().endswith(".json"):
                    with fs.open(url) as f:
                        d = json.load(f)
                        if "id" in d:
                            d["url"] = url
                            self.job_id_to_job[d["id"]] = d

        for path in paths:
            json_data = {}
            basename = os.path.splitext(path)[0]
            old_path = basename + "_filters.json"
            json_path = basename + ".json"
            if os.path.exists(old_path) and os.path.getsize(old_path) > 0:
                with open(old_path, "rt") as f:
                    json_data["filters"] = json.load(f)

            if os.path.exists(json_path) and os.path.getsize(json_path) > 0:
                with open(json_path, "rt") as f:
                    try:
                        json_data.update(json.load(f))
                    except ValueError:  # includes json.JSONDecodeError
                        print("Unable to load {}".format(json_path))
            meta = create_dataset_meta(path)
            if "filters" not in json_data:
                json_data["filters"] = {}
            if "views" not in json_data:
                json_data["views"] = {}
            if "categories" not in json_data:
                json_data["categories"] = {}
            self.dataset_to_info[path] = dict(json_data=json_data,
                                              meta=meta,
                                              json_path=json_path)
Example 8
    def __init__(self, paths):
        super().__init__()
        self.dataset_to_info = {}  # json_data, meta, json_path
        self.job_id_to_job = {}
        if os.environ.get(CIRRO_JOB_RESULTS) is not None:  # load saved on disk
            fs = get_fs(os.environ[CIRRO_JOB_RESULTS])
            fs.makedirs(os.environ[CIRRO_JOB_RESULTS], exist_ok=True)
            for url in fs.ls(os.environ[CIRRO_JOB_RESULTS]):
                if url.lower().endswith('.json.gz'):
                    import gzip
                    with gzip.open(fs.open(url)) as f:
                        d = json.load(f)
                        if 'id' in d:
                            d['url'] = url
                            self.job_id_to_job[d['id']] = d
                elif url.lower().endswith('.json'):
                    with fs.open(url) as f:
                        d = json.load(f)
                        if 'id' in d:
                            d['url'] = url
                            self.job_id_to_job[d['id']] = d

        for path in paths:
            json_data = {}
            basename = os.path.splitext(path)[0]
            old_path = basename + '_filters.json'
            json_path = basename + '.json'
            if os.path.exists(old_path) and os.path.getsize(old_path) > 0:
                with open(old_path, 'rt') as f:
                    json_data['filters'] = json.load(f)

            if os.path.exists(json_path) and os.path.getsize(json_path) > 0:
                with open(json_path, 'rt') as f:
                    try:
                        json_data.update(json.load(f))
                    except ValueError:  # includes json.JSONDecodeError
                        print('Unable to load {}'.format(json_path))
            meta = create_dataset_meta(path)
            if 'filters' not in json_data:
                json_data['filters'] = {}
            if 'views' not in json_data:
                json_data['views'] = {}
            if 'categories' not in json_data:
                json_data['categories'] = {}
            self.dataset_to_info[path] = dict(json_data=json_data,
                                              meta=meta,
                                              json_path=json_path)
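
A sketch of the sidecar file this constructor looks for next to each dataset (file name and contents are hypothetical; only the top-level keys matter to the code above):

import json

# Hypothetical sidecar 'pbmc3k.json' next to 'pbmc3k.h5ad'; the constructor
# above reads it and fills in missing top-level keys with {}.
with open('pbmc3k.json', 'wt') as f:
    json.dump({'filters': {}, 'views': {}, 'categories': {}}, f)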
Example 9
def configure_app(app, list_of_dataset_paths, spatial_directories,
                  marker_paths):
    from cirrocumulus.api import dataset_api
    from cirrocumulus.no_auth import NoAuth

    try:
        from cirrocumulus.parquet_dataset import ParquetDataset

        dataset_api.add(ParquetDataset())
    except ModuleNotFoundError:
        pass
    try:
        from cirrocumulus.zarr_dataset import ZarrDataset

        dataset_api.add(ZarrDataset())
    except ModuleNotFoundError:
        pass
    app.config[CIRRO_AUTH] = NoAuth()
    os.environ[CIRRO_JOB_TYPE + "de"] = "cirrocumulus.job_api.run_de"
    anndata_dataset = AnndataDataset()
    dataset_ids = []
    for dataset_paths in list_of_dataset_paths:
        dataset_paths = dataset_paths.split(",")
        dataset_id = dataset_paths[0]
        dataset_ids.append(dataset_id)
        if len(dataset_paths) > 1:
            datasets = []
            for i in range(len(dataset_paths)):
                dataset = anndata_dataset.get_data(get_fs(dataset_paths[i]),
                                                   dataset_paths[i])
                if "group" not in dataset.var:
                    dataset.var["group"] = dataset.uns.get(
                        "name", "dataset {}".format(i + 1))
                datasets.append(dataset)
            adata = anndata.concat(datasets,
                                   axis=1,
                                   label="group",
                                   merge="unique")
            adata.obsm = datasets[0].obsm  # attach embeddings from the first dataset to the joined result
            adata.var.index = adata.var.index.str.replace("/", "_")
            adata.var_names_make_unique()
            anndata_dataset.add_data(dataset_id, adata)
        dataset_api.add(anndata_dataset)

    app.config[CIRRO_DATABASE] = LocalDbAPI(dataset_ids)

    if spatial_directories is not None and len(spatial_directories) > 0:
        for i in range(len(spatial_directories)):
            spatial_directory = spatial_directories[i]
            if spatial_directory != "":
                adata = anndata_dataset.get_data(get_fs(dataset_ids[i]),
                                                 dataset_ids[i])
                if not add_spatial(adata, spatial_directory):
                    print("No spatial data found in {}".format(
                        spatial_directory))

    if marker_paths is not None and len(marker_paths) > 0:
        markers = get_markers(marker_paths)
        for dataset_id in dataset_ids:
            d = anndata_dataset.get_data(get_fs(dataset_id), dataset_id)
            existing_markers = d.uns.get("markers", [])
            markers += existing_markers
            # remove genes in dict that are not in dataset
            d.uns["markers"] = filter_markers(d, markers)
def main(argsv):
    import argparse

    parser = argparse.ArgumentParser(description="Run cirrocumulus")
    parser.add_argument(
        "dataset",
        help=
        "Path to dataset in h5ad, loom, Seurat, TileDB, zarr, or STAR-Fusion format. Separate multiple datasets with "
        "a comma instead of a space in order to join datasets by cell id",
        nargs="+",
    )
    parser.add_argument("--spatial", help=SPATIAL_HELP, nargs="*")
    parser.add_argument(
        "--markers",
        help=
        'Path to JSON file that maps name to features. For example {"a":["gene1", "gene2"], "b":["gene3"]}',
        nargs="*",
    )
    parser.add_argument(
        "--host", help="Host IP address"
    )  # set to 0.0.0.0 to make it accessible from other computers WITHOUT SECURITY.

    parser.add_argument("--port", help="Server port", default=5000, type=int)
    parser.add_argument("--no-open",
                        dest="no_open",
                        help="Do not open your web browser",
                        action="store_true")
    parser.add_argument(
        "--results",
        help="URL to save user computed results (e.g. differential expression)"
    )
    parser.add_argument("--ontology",
                        help="Path to ontology in OBO format for annotation")
    args = parser.parse_args(argsv)
    if args.results is not None:
        os.environ[CIRRO_JOB_RESULTS] = args.results
    else:
        os.environ[CIRRO_JOB_RESULTS] = os.path.join(
            os.path.dirname(args.dataset[0].rstrip("/")), "results")
    get_fs(os.environ[CIRRO_JOB_RESULTS]).makedirs(
        os.environ[CIRRO_JOB_RESULTS], exist_ok=True)
    if args.ontology is not None:
        os.environ[CIRRO_CELL_ONTOLOGY] = args.ontology
    app = create_app()
    configure_app(app, args.dataset, args.spatial, args.markers)
    if not args.no_open:
        import webbrowser

        import requests

        host = args.host if args.host is not None else "http://127.0.0.1"
        url = host + ":" + str(args.port)
        try:
            if requests.get(url).ok:
                import sys

                sys.exit("Address already in use")
        except requests.exceptions.RequestException:
            pass  # no response means nothing is listening, so the port is free
        webbrowser.open(url)
    from flask import cli

    cli.show_server_banner = lambda *_: None  # suppress warning message
    app.run(host=args.host, port=args.port, debug=False)
Example 11
def main(argsv):
    import argparse
    import os
    parser = argparse.ArgumentParser(description='Run cirrocumulus server')
    parser.add_argument('--db_uri',
                        help='Database connection URI',
                        default=DEFAULT_DB_URI)
    parser.add_argument('-w',
                        '--workers',
                        dest='workers',
                        help='The number of worker processes',
                        type=int)
    parser.add_argument(
        '-t',
        '--timeout',
        dest='timeout',
        help=
        'Workers silent for more than this many seconds are killed and restarted',
        type=int,
        default=30)
    parser.add_argument(
        '-b',
        '--bind',
        dest='bind',
        help=
        'Server socket to bind. Server sockets can be any of $(HOST), $(HOST):$(PORT), fd://$(FD), or unix:$(PATH). An IP is a valid $(HOST).',
        default='127.0.0.1:5000')
    parser.add_argument(
        '--footer', help='Markdown file to customize the application footer')
    parser.add_argument(
        '--header', help='Markdown file to customize the application header')
    parser.add_argument('--upload', help='URL to allow users to upload files')
    parser.add_argument(
        '--results',
        help=
        'URL to save user computed results (e.g. differential expression) to')
    parser.add_argument('--ontology',
                        help='Path to ontology in OBO format for annotation')
    args = parser.parse_args(argsv)

    bind = args.bind if args.bind is not None else '127.0.0.1:5000'
    if args.ontology is not None:
        os.environ[CIRRO_CELL_ONTOLOGY] = args.ontology
    os.environ[CIRRO_DB_URI] = args.db_uri

    if args.footer is not None:
        os.environ[CIRRO_FOOTER] = args.footer

    if args.header is not None:
        os.environ[CIRRO_BRAND] = args.header

    if args.workers is not None:
        workers = args.workers
    else:
        workers = 2 * os.cpu_count()  # os is already imported at the top of main
    if args.upload is not None:
        os.environ[CIRRO_UPLOAD] = args.upload
    if args.results is not None:
        os.environ[CIRRO_JOB_RESULTS] = args.results
        get_fs(os.environ[CIRRO_JOB_RESULTS]).makedirs(
            os.environ[CIRRO_JOB_RESULTS], exist_ok=True)

    run_args = [
        'gunicorn', '-b', bind, '-w',
        str(workers), '-t',
        str(args.timeout), '-n', 'cirrocumulus-webserver',
        'cirrocumulus.serve:cached_app()'
    ]
    # if args.gunicorn is not None:
    #     run_args += args.gunicorn.split(' ')
    import subprocess
    subprocess.check_call(run_args)
Example 12
def main(argsv):
    import os
    import argparse

    parser = argparse.ArgumentParser(description="Run cirrocumulus server")
    parser.add_argument("--db_uri",
                        help="Database connection URI",
                        default=DEFAULT_DB_URI)
    parser.add_argument("-w",
                        "--workers",
                        dest="workers",
                        help="The number of worker processes",
                        type=int)
    parser.add_argument(
        "-t",
        "--timeout",
        dest="timeout",
        help=
        "Workers silent for more than this many seconds are killed and restarted",
        type=int,
        default=30,
    )
    parser.add_argument(
        "-b",
        "--bind",
        dest="bind",
        help=
        "Server socket to bind. Server sockets can be any of $(HOST), $(HOST):$(PORT), fd://$(FD), or unix:$(PATH). An IP is a valid $(HOST).",
        default="127.0.0.1:5000",
    )
    parser.add_argument(
        "--footer", help="Markdown file to customize the application footer")
    parser.add_argument(
        "--header", help="Markdown file to customize the application header")
    parser.add_argument("--upload", help="URL to allow users to upload files")
    parser.add_argument(
        "--results",
        help=
        "URL to save user computed results (e.g. differential expression) to")
    parser.add_argument("--ontology",
                        help="Path to ontology in OBO format for annotation")
    args = parser.parse_args(argsv)

    bind = args.bind if args.bind is not None else "127.0.0.1:5000"
    if args.ontology is not None:
        os.environ[CIRRO_CELL_ONTOLOGY] = args.ontology
    os.environ[CIRRO_DB_URI] = args.db_uri

    if args.footer is not None:
        os.environ[CIRRO_FOOTER] = args.footer

    if args.header is not None:
        os.environ[CIRRO_BRAND] = args.header

    if args.workers is not None:
        workers = args.workers
    else:
        workers = 2 * os.cpu_count()  # os is already imported at the top of main
    if args.upload is not None:
        os.environ[CIRRO_UPLOAD] = args.upload
    if args.results is not None:
        os.environ[CIRRO_JOB_RESULTS] = args.results
        get_fs(os.environ[CIRRO_JOB_RESULTS]).makedirs(
            os.environ[CIRRO_JOB_RESULTS], exist_ok=True)

    run_args = [
        "gunicorn",
        "-b",
        bind,
        "-w",
        str(workers),
        "-t",
        str(args.timeout),
        "-n",
        "cirrocumulus-webserver",
        "cirrocumulus.serve:cached_app()",
    ]
    # if args.gunicorn is not None:
    #     run_args += args.gunicorn.split(' ')
    import subprocess

    subprocess.check_call(run_args)
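
A sketch of how the argument list expands; the invocation below is hypothetical, but the resulting command follows directly from run_args above:

# With these arguments the subprocess call is equivalent to:
#   gunicorn -b 0.0.0.0:3000 -w 4 -t 30 -n cirrocumulus-webserver "cirrocumulus.serve:cached_app()"
main(["--workers", "4", "--bind", "0.0.0.0:3000"])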
Example 13
    def execute(self):
        output_format = self.output_format
        dataset = self.dataset
        if self.groups is None and not self.no_auto_groups:
            groups = []
            existing_fields = set()
            scanpy_marker_keys = get_scanpy_marker_keys(dataset)
            for key in scanpy_marker_keys:
                group_by = dataset.uns[key]['params']['groupby']
                if isinstance(group_by, np.ndarray):
                    group_by = ','.join(group_by)
                existing_fields.add(group_by)
            for field in dataset.obs.columns:
                field_lc = field.lower()
                for cluster_field in cluster_fields:
                    if field_lc.find(
                            cluster_field
                    ) != -1 and cluster_field not in existing_fields:
                        groups.append(field)
                        break
            self.groups = groups
        if self.groups is not None and len(self.groups) > 0:
            use_pegasus = False
            use_scanpy = False
            try:
                import pegasus as pg
                use_pegasus = True
            except ModuleNotFoundError:
                pass
            if not use_pegasus:
                try:
                    import scanpy as sc
                    use_scanpy = True
                except ModuleNotFoundError:
                    pass
            if not use_pegasus and not use_scanpy:
                raise ValueError(
                    'Please install pegasuspy or scanpy to compute markers')

            for group in self.groups:
                field = group
                if group not in dataset.obs:  # test if multiple comma separated fields
                    split_groups = group.split(',')
                    if len(split_groups) > 1:
                        use_split_groups = True
                        for split_group in split_groups:
                            if split_group not in dataset.obs:
                                use_split_groups = False
                                break
                        if use_split_groups:
                            dataset.obs[field] = dataset.obs[
                                split_groups[0]].str.cat(
                                    dataset.obs[split_groups[1:]], sep=',')

                if field in dataset.obs:
                    if not pd.api.types.is_categorical_dtype(
                            dataset.obs[field]):
                        dataset.obs[field] = dataset.obs[field].astype(
                            str).astype('category')
                    if len(dataset.obs[field].cat.categories) > 1:
                        logger.info('Computing markers for {}'.format(field))
                        key_added = 'rank_genes_' + str(field)
                        if use_pegasus:
                            pg.de_analysis(dataset,
                                           cluster=field,
                                           de_key=key_added)
                        else:
                            sc.tl.rank_genes_groups(dataset,
                                                    field,
                                                    key_added=key_added,
                                                    method='t-test')
                else:
                    logger.info(group + ' not found in ' +
                                ', '.join(dataset.obs.columns))
        schema = self.get_schema()
        schema['format'] = output_format
        if output_format in ['parquet', 'zarr']:
            output_dir = self.base_output
        else:
            output_dir = os.path.splitext(self.base_output)[0]
        filesystem = get_fs(output_dir)
        filesystem.makedirs(output_dir, exist_ok=True)
        results = schema.get('results', [])

        if len(results) > 0:
            uns_dir = os.path.join(output_dir, 'uns')
            is_gzip = output_format != 'jsonl'
            filesystem.makedirs(uns_dir, exist_ok=True)

            for i in range(len(results)):
                full_result = results[i]
                result_id = full_result.pop('id')
                # keep id, name, type in schema, store rest externally
                results[i] = dict(id=result_id,
                                  name=full_result.pop('name'),
                                  type=full_result.pop('type'),
                                  content_type='application/json',
                                  content_encoding='gzip' if is_gzip else None)
                json_result = to_json(full_result)

                result_path = os.path.join(
                    uns_dir, result_id +
                    '.json.gz') if is_gzip else os.path.join(
                        uns_dir, result_id + '.json')
                with open_file(result_path,
                               'wt',
                               compression='gzip' if is_gzip else None) as out:
                    out.write(json_result)
        images = dataset.uns.get('images')
        if images is not None:
            image_dir = os.path.join(output_dir, 'images')
            filesystem.makedirs(image_dir, exist_ok=True)
            for image in images:
                src = image['image']
                dest = os.path.join(image_dir, os.path.basename(src))
                filesystem.copy(src, dest)
                image['image'] = 'images/' + os.path.basename(src)

        if output_format == 'parquet':
            from cirrocumulus.parquet_output import save_dataset_pq
            save_dataset_pq(dataset, schema, self.base_output, filesystem,
                            self.save_whitelist)
        elif output_format == 'jsonl':
            from cirrocumulus.jsonl_io import save_dataset_jsonl
            save_dataset_jsonl(dataset, schema, output_dir, self.base_output,
                               filesystem)
        elif output_format == 'zarr':
            from cirrocumulus.zarr_output import save_dataset_zarr
            save_dataset_zarr(dataset, schema, self.base_output, filesystem,
                              self.save_whitelist)
        else:
            raise ValueError("Unknown format")
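
A sketch of where the computed markers land on the scanpy path (the obs field name is hypothetical):

# With scanpy, markers for an obs column such as 'louvain' are stored under
# dataset.uns['rank_genes_louvain']; each schema result is then written
# externally as uns/<result_id>.json(.gz) in the output directory.
de_result = dataset.uns.get('rank_genes_louvain')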
Example 14
    def execute(self):
        output_format = self.output_format
        dataset = self.dataset
        if self.groups is None and not self.no_auto_groups:
            groups = []
            existing_fields = set()
            scanpy_marker_keys = get_scanpy_marker_keys(dataset)
            for key in scanpy_marker_keys:
                group_by = dataset.uns[key]["params"]["groupby"]
                if isinstance(group_by, np.ndarray):
                    group_by = ",".join(group_by)
                existing_fields.add(group_by)
            for field in dataset.obs.columns:
                field_lc = field.lower()
                for cluster_field in cluster_fields:
                    if field_lc.find(cluster_field) != -1 and cluster_field not in existing_fields:
                        groups.append(field)
                        break

            self.groups = groups
        if self.groups is not None and len(self.groups) > 0:
            use_pegasus = False
            use_scanpy = False
            try:
                import pegasus as pg

                use_pegasus = True
            except ModuleNotFoundError:
                pass
            if not use_pegasus:
                try:
                    import scanpy as sc

                    use_scanpy = True
                    if "log1p" not in dataset.uns:
                        dataset.uns["log1p"] = {}
                    if "base" not in dataset.uns["log1p"]:
                        dataset.uns["log1p"]["base"] = None
                except ModuleNotFoundError:
                    pass
            if not use_pegasus and not use_scanpy:
                raise ValueError("Please install pegasuspy or scanpy to compute markers")
            first_time = True

            for group in self.groups:
                field = group
                if group not in dataset.obs:  # test if multiple comma separated fields
                    split_groups = group.split(",")
                    if len(split_groups) > 1:
                        use_split_groups = True
                        for split_group in split_groups:
                            if split_group not in dataset.obs:
                                use_split_groups = False
                                break
                        if use_split_groups:
                            dataset.obs[field] = dataset.obs[split_groups[0]].str.cat(
                                dataset.obs[split_groups[1:]], sep=","
                            )

                if field in dataset.obs:
                    if not pd.api.types.is_categorical_dtype(dataset.obs[field]):
                        dataset.obs[field] = dataset.obs[field].astype(str).astype("category")
                    if len(dataset.obs[field].cat.categories) > 1:
                        key_added = "rank_genes_" + str(field)
                        value_counts = dataset.obs[field].value_counts()
                        filtered_value_counts = value_counts[value_counts >= 3]
                        if len(filtered_value_counts) >= 2:
                            if first_time:
                                logger.info(
                                    "Using {} to compute markers".format(
                                        "pegasuspy" if use_pegasus else "scanpy"
                                    )
                                )
                                first_time = False
                            logger.info("Computing markers for {}".format(field))
                            if use_pegasus:
                                pg.de_analysis(
                                    dataset,
                                    cluster=field,
                                    de_key=key_added,
                                    subset=filtered_value_counts.index.to_list(),
                                )
                            else:
                                sc.tl.rank_genes_groups(
                                    dataset,
                                    field,
                                    key_added=key_added,
                                    method="t-test",
                                    groups=filtered_value_counts.index.to_list(),
                                )
                else:
                    raise ValueError(group + " not found in " + ", ".join(dataset.obs.columns))
        schema = self.get_schema()
        schema["format"] = output_format
        if output_format in ["parquet", "zarr"]:
            output_dir = self.base_output
        else:
            output_dir = os.path.splitext(self.base_output)[0]
        filesystem = get_fs(output_dir)
        filesystem.makedirs(output_dir, exist_ok=True)
        results = schema.get("results", [])

        if len(results) > 0:
            uns_dir = os.path.join(output_dir, "uns")
            is_gzip = output_format != "jsonl"
            filesystem.makedirs(uns_dir, exist_ok=True)

            for i in range(len(results)):
                full_result = results[i]
                result_id = full_result.pop("id")
                # keep id, name, type in schema, store rest externally
                results[i] = dict(
                    id=result_id,
                    name=full_result.pop("name"),
                    type=full_result.pop("type"),
                    content_type="application/json",
                    content_encoding="gzip" if is_gzip else None,
                )
                json_result = to_json(full_result)

                result_path = (
                    os.path.join(uns_dir, result_id + ".json.gz")
                    if is_gzip
                    else os.path.join(uns_dir, result_id + ".json")
                )
                with open_file(result_path, "wt", compression="gzip" if is_gzip else None) as out:
                    out.write(json_result)
        images = dataset.uns.pop("images", None)
        if images is not None:
            image_dir = os.path.join(output_dir, "images")
            filesystem.makedirs(image_dir, exist_ok=True)
            for image in images:
                src = image["image"]
                dest = os.path.join(image_dir, os.path.basename(src))
                filesystem.copy(src, dest)
                image["image"] = "images/" + os.path.basename(src)

        if output_format == "parquet":
            from cirrocumulus.parquet_output import save_dataset_pq

            save_dataset_pq(dataset, schema, self.base_output, filesystem, self.save_whitelist)
        elif output_format == "jsonl":
            from cirrocumulus.jsonl_io import save_dataset_jsonl

            save_dataset_jsonl(dataset, schema, output_dir, self.base_output, filesystem)
        elif output_format == "zarr":
            from cirrocumulus.zarr_output import save_dataset_zarr

            save_dataset_zarr(dataset, schema, self.base_output, filesystem, self.save_whitelist)
        else:
            raise ValueError("Unknown format")
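
A minimal sketch of the small-group filter this version applies before running differential expression; the series below is toy data and the field name is hypothetical:

import pandas as pd

# Groups with fewer than 3 cells are excluded, and DE only runs if at
# least two groups remain.
field = pd.Series(["a", "a", "a", "b", "b", "c"], dtype="category")
counts = field.value_counts()
kept = counts[counts >= 3]   # only "a" has >= 3 cells here
should_run = len(kept) >= 2  # False for this toy series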