def get_result(self, dataset, result_id):
    path = dataset["url"]
    provider = self.get_dataset_provider(path)
    return provider.get_result(get_fs(path), path, dataset=dataset, result_id=result_id)

def read_dataset(self, dataset, keys=None):
    # avoid a mutable default argument
    keys = [] if keys is None else keys
    path = dataset["url"]
    provider = self.get_dataset_provider(path)
    return provider.read_dataset(get_fs(path), path, keys=keys, dataset=dataset)

def delete_job_result(self, result):
    if result is None:
        return
    if isinstance(result, dict) and result.get("url") is not None:
        # result stored on a filesystem
        fs = get_fs(result["url"])
        if fs.exists(result["url"]):
            fs.rm(result["url"], recursive=True)
    else:
        # result stored in GridFS, keyed by object id
        self.get_gridfs().delete(ObjectId(result))

def get_schema(self, dataset):
    path = dataset["url"]
    provider = self.get_dataset_provider(path)
    schema_dict = provider.get_schema(get_fs(path), path)
    if "summary" in dataset:
        schema_dict["summary"] = dataset["summary"]
    if "markers" in schema_dict:
        # markers baked into the dataset are exposed as read-only
        schema_dict["markers_read_only"] = schema_dict.pop("markers")
    return schema_dict

def get_dataset_info(self, dataset):
    dataset_id = dataset["id"]
    if self.cached_dataset_id == dataset_id:
        dataset_info = self.cached_dataset_info
    else:
        path = dataset["url"]
        provider = self.get_dataset_provider(path)
        dataset_info = provider.get_dataset_info(get_fs(path), path)
        # cache info for the most recently accessed dataset only
        self.cached_dataset_info = dataset_info
        self.cached_dataset_id = dataset_id
    return dataset_info

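# get_dataset_info above caches only the most recently requested dataset, which
# is enough when a local server typically serves one dataset at a time. A
# self-contained sketch of that single-entry cache pattern (hypothetical
# helper, not part of cirrocumulus):
class LastValueCache:
    def __init__(self, compute):
        self.compute = compute  # maps a key to its value
        self.key = None
        self.value = None

    def get(self, key):
        # recompute only when the key changes; otherwise return the cached value
        if key != self.key:
            self.value = self.compute(key)
            self.key = key
        return self.value
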
def __init__(self, paths): super().__init__() self.dataset_to_info = {} # json_data, meta, json_path self.job_id_to_job = {} if os.environ.get(CIRRO_JOB_RESULTS) is not None: # load saved on disk fs = get_fs(os.environ[CIRRO_JOB_RESULTS]) fs.makedirs(os.environ[CIRRO_JOB_RESULTS], exist_ok=True) for url in fs.ls(os.environ[CIRRO_JOB_RESULTS]): if url.lower().endswith(".json.gz"): import gzip with gzip.open(fs.open(url)) as f: d = json.load(f) if "id" in d: d["url"] = url self.job_id_to_job[d["id"]] = d elif url.lower().endswith(".json"): with fs.open(url) as f: d = json.load(f) if "id" in d: d["url"] = url self.job_id_to_job[d["id"]] = d for path in paths: json_data = {} basename = os.path.splitext(path)[0] old_path = basename + "_filters.json" json_path = basename + ".json" if os.path.exists(old_path) and os.path.getsize(old_path) > 0: with open(old_path, "rt") as f: json_data["filters"] = json.load(f) if os.path.exists(json_path) and os.path.getsize(json_path) > 0: with open(json_path, "rt") as f: try: json_data.update(json.load(f)) except: print("Unable to load {}".format(json_path)) meta = create_dataset_meta(path) if "filters" not in json_data: json_data["filters"] = {} if "views" not in json_data: json_data["views"] = {} if "categories" not in json_data: json_data["categories"] = {} self.dataset_to_info[path] = dict(json_data=json_data, meta=meta, json_path=json_path)
def __init__(self, paths):
    super().__init__()
    self.dataset_to_info = {}  # path -> dict(json_data, meta, json_path)
    self.job_id_to_job = {}
    if os.environ.get(CIRRO_JOB_RESULTS) is not None:
        # load job results previously saved on disk
        import gzip

        fs = get_fs(os.environ[CIRRO_JOB_RESULTS])
        fs.makedirs(os.environ[CIRRO_JOB_RESULTS], exist_ok=True)
        for url in fs.ls(os.environ[CIRRO_JOB_RESULTS]):
            if url.lower().endswith(".json.gz"):
                with gzip.open(fs.open(url)) as f:
                    d = json.load(f)
                    if "id" in d:
                        d["url"] = url
                        self.job_id_to_job[d["id"]] = d
            elif url.lower().endswith(".json"):
                with fs.open(url) as f:
                    d = json.load(f)
                    if "id" in d:
                        d["url"] = url
                        self.job_id_to_job[d["id"]] = d
    for path in paths:
        json_data = {}
        basename = os.path.splitext(path)[0]
        old_path = basename + "_filters.json"  # legacy filters-only sidecar file
        json_path = basename + ".json"
        if os.path.exists(old_path) and os.path.getsize(old_path) > 0:
            with open(old_path, "rt") as f:
                json_data["filters"] = json.load(f)
        if os.path.exists(json_path) and os.path.getsize(json_path) > 0:
            with open(json_path, "rt") as f:
                try:
                    json_data.update(json.load(f))
                except Exception:
                    print("Unable to load {}".format(json_path))
        meta = create_dataset_meta(path)
        for key in ("filters", "views", "categories"):
            if key not in json_data:
                json_data[key] = {}
        self.dataset_to_info[path] = dict(json_data=json_data, meta=meta, json_path=json_path)

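# The constructor above looks for an optional sidecar file next to each dataset
# (e.g. pbmc.json for pbmc.h5ad; the older pbmc_filters.json holds filters
# only). A minimal sketch of writing one, assuming just the keys the loader
# reads; the dataset basename is hypothetical:
import json

sidecar = {"filters": {}, "views": {}, "categories": {}}
with open("pbmc.json", "wt") as f:  # hypothetical dataset basename
    json.dump(sidecar, f)
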
def configure_app(app, list_of_dataset_paths, spatial_directories, marker_paths):
    from cirrocumulus.api import dataset_api
    from cirrocumulus.no_auth import NoAuth

    # optional dataset backends; skip silently if the dependency is missing
    try:
        from cirrocumulus.parquet_dataset import ParquetDataset

        dataset_api.add(ParquetDataset())
    except ModuleNotFoundError:
        pass
    try:
        from cirrocumulus.zarr_dataset import ZarrDataset

        dataset_api.add(ZarrDataset())
    except ModuleNotFoundError:
        pass
    app.config[CIRRO_AUTH] = NoAuth()
    os.environ[CIRRO_JOB_TYPE + "de"] = "cirrocumulus.job_api.run_de"
    anndata_dataset = AnndataDataset()
    dataset_ids = []
    for dataset_paths in list_of_dataset_paths:
        dataset_paths = dataset_paths.split(",")
        dataset_id = dataset_paths[0]
        dataset_ids.append(dataset_id)
        if len(dataset_paths) > 1:
            # join comma-separated datasets by cell id
            datasets = []
            for i in range(len(dataset_paths)):
                dataset = anndata_dataset.get_data(get_fs(dataset_paths[i]), dataset_paths[i])
                if "group" not in dataset.var:
                    dataset.var["group"] = dataset.uns.get("name", "dataset {}".format(i + 1))
                datasets.append(dataset)
            adata = anndata.concat(datasets, axis=1, label="group", merge="unique")
            adata.obsm = datasets[0].obsm  # restore embeddings from the first dataset on the joined object
            adata.var.index = adata.var.index.str.replace("/", "_")
            adata.var_names_make_unique()
            anndata_dataset.add_data(dataset_id, adata)
    dataset_api.add(anndata_dataset)
    app.config[CIRRO_DATABASE] = LocalDbAPI(dataset_ids)
    if spatial_directories is not None and len(spatial_directories) > 0:
        for i in range(len(spatial_directories)):
            spatial_directory = spatial_directories[i]
            if spatial_directory != "":
                adata = anndata_dataset.get_data(get_fs(dataset_ids[i]), dataset_ids[i])
                if not add_spatial(adata, spatial_directory):
                    print("No spatial data found in {}".format(spatial_directory))
    if marker_paths is not None and len(marker_paths) > 0:
        markers = get_markers(marker_paths)
        for dataset_id in dataset_ids:
            d = anndata_dataset.get_data(get_fs(dataset_id), dataset_id)
            existing_markers = d.uns.get("markers", [])
            markers += existing_markers
            # remove genes in the marker dict that are not in the dataset
            d.uns["markers"] = filter_markers(d, markers)

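# configure_app joins comma-separated datasets along the feature axis, so cells
# (obs) are shared and genes/features are stacked. A standalone sketch of that
# join using only the public anndata API, with toy data (names are
# illustrative):
import anndata
import numpy as np
import pandas as pd

a = anndata.AnnData(np.ones((3, 2)), obs=pd.DataFrame(index=["c1", "c2", "c3"]),
                    var=pd.DataFrame(index=["g1", "g2"]))
b = anndata.AnnData(np.zeros((3, 2)), obs=pd.DataFrame(index=["c1", "c2", "c3"]),
                    var=pd.DataFrame(index=["g3", "g4"]))
joined = anndata.concat([a, b], axis=1, label="group", merge="unique")
print(joined.shape)  # (3, 4): 3 cells, 4 stacked features; var["group"] marks the source
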
def main(argsv): import argparse parser = argparse.ArgumentParser(description="Run cirrocumulus") parser.add_argument( "dataset", help= "Path to dataset in h5ad, loom, Seurat, TileDB, zarr, or STAR-Fusion format. Separate multiple datasets with " "a comma instead of a space in order to join datasets by cell id", nargs="+", ) parser.add_argument("--spatial", help=SPATIAL_HELP, nargs="*") parser.add_argument( "--markers", help= 'Path to JSON file that maps name to features. For example {"a":["gene1", "gene2"], "b":["gene3"]}', nargs="*", ) parser.add_argument( "--host", help="Host IP address" ) # set to 0.0.0.0 to make it accessible from other computers WITHOUT SECURITY. parser.add_argument("--port", help="Server port", default=5000, type=int) parser.add_argument("--no-open", dest="no_open", help="Do not open your web browser", action="store_true") parser.add_argument( "--results", help="URL to save user computed results (e.g. differential expression)" ) parser.add_argument("--ontology", help="Path to ontology in OBO format for annotation") args = parser.parse_args(argsv) if args.results is not None: os.environ[CIRRO_JOB_RESULTS] = args.results else: os.environ[CIRRO_JOB_RESULTS] = os.path.join( os.path.dirname(args.dataset[0].rstrip("/")), "results") get_fs(os.environ[CIRRO_JOB_RESULTS]).makedirs( os.environ[CIRRO_JOB_RESULTS], exist_ok=True) if args.ontology is not None: os.environ[CIRRO_CELL_ONTOLOGY] = args.ontology app = create_app() configure_app(app, args.dataset, args.spatial, args.markers) if not args.no_open: import webbrowser import requests host = args.host if args.host is not None else "http://127.0.0.1" url = host + ":" + str(args.port) try: if requests.get(url).ok: import sys sys.exit("Address already in use") except: pass webbrowser.open(url) from flask import cli cli.show_server_banner = lambda *_: None # suppress warning message app.run(host=args.host, port=args.port, debug=False)
def main(argsv): import os import argparse parser = argparse.ArgumentParser(description="Run cirrocumulus server") parser.add_argument("--db_uri", help="Database connection URI", default=DEFAULT_DB_URI) parser.add_argument("-w", "--workers", dest="workers", help="The number of worker processes", type=int) parser.add_argument( "-t", "--timeout", dest="timeout", help= "Workers silent for more than this many seconds are killed and restarted", type=int, default=30, ) parser.add_argument( "-b", "--bind", dest="bind", help= "Server socket to bind. Server sockets can be any of $(HOST), $(HOST):$(PORT), fd://$(FD), or unix:$(PATH). An IP is a valid $(HOST).", default="127.0.0.1:5000", ) parser.add_argument( "--footer", help="Markdown file to customize the application footer") parser.add_argument( "--header", help="Markdown file to customize the application header") parser.add_argument("--upload", help="URL to allow users to upload files") parser.add_argument( "--results", help= "URL to save user computed results (e.g. differential expression) to") parser.add_argument("--ontology", help="Path to ontology in OBO format for annotation") args = parser.parse_args(argsv) bind = args.bind if args.bind is not None else "127.0.0.1:5000" if args.ontology is not None: os.environ[CIRRO_CELL_ONTOLOGY] = args.ontology os.environ[CIRRO_DB_URI] = args.db_uri if args.footer is not None: os.environ[CIRRO_FOOTER] = args.footer if args.header is not None: os.environ[CIRRO_BRAND] = args.header if args.workers is not None: workers = args.workers else: import os workers = 2 * os.cpu_count() if args.upload is not None: os.environ[CIRRO_UPLOAD] = args.upload if args.results is not None: os.environ[CIRRO_JOB_RESULTS] = args.results get_fs(os.environ[CIRRO_JOB_RESULTS]).makedirs( os.environ[CIRRO_JOB_RESULTS], exist_ok=True) run_args = [ "gunicorn", "-b", bind, "-w", str(workers), "-t", str(args.timeout), "-n", "cirrocumulus-webserver", "cirrocumulus.serve:cached_app()", ] # if args.gunicorn is not None: # run_args += args.gunicorn.split(' ') import subprocess subprocess.check_call(run_args)
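# The run_args above amount to the following command line (worker count shown
# for an assumed 4-CPU machine):
#
#   gunicorn -b 127.0.0.1:5000 -w 8 -t 30 -n cirrocumulus-webserver "cirrocumulus.serve:cached_app()"
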
def execute(self):
    output_format = self.output_format
    dataset = self.dataset
    if self.groups is None and not self.no_auto_groups:
        # auto-detect obs fields that look like cluster assignments
        groups = []
        existing_fields = set()
        scanpy_marker_keys = get_scanpy_marker_keys(dataset)
        for key in scanpy_marker_keys:
            group_by = dataset.uns[key]["params"]["groupby"]
            if isinstance(group_by, np.ndarray):
                group_by = ",".join(group_by)
            existing_fields.add(group_by)
        for field in dataset.obs.columns:
            field_lc = field.lower()
            for cluster_field in cluster_fields:
                if field_lc.find(cluster_field) != -1 and cluster_field not in existing_fields:
                    groups.append(field)
                    break
        self.groups = groups
    if self.groups is not None and len(self.groups) > 0:
        use_pegasus = False
        use_scanpy = False
        try:
            import pegasus as pg

            use_pegasus = True
        except ModuleNotFoundError:
            pass
        if not use_pegasus:
            try:
                import scanpy as sc

                use_scanpy = True
                # scanpy's rank_genes_groups expects log1p metadata to be present
                if "log1p" not in dataset.uns:
                    dataset.uns["log1p"] = {}
                if "base" not in dataset.uns["log1p"]:
                    dataset.uns["log1p"]["base"] = None
            except ModuleNotFoundError:
                pass
        if not use_pegasus and not use_scanpy:
            raise ValueError("Please install pegasuspy or scanpy to compute markers")
        first_time = True
        for group in self.groups:
            field = group
            if group not in dataset.obs:
                # test whether group names multiple comma-separated fields
                split_groups = group.split(",")
                if len(split_groups) > 1:
                    use_split_groups = all(split_group in dataset.obs for split_group in split_groups)
                    if use_split_groups:
                        dataset.obs[field] = dataset.obs[split_groups[0]].str.cat(
                            dataset.obs[split_groups[1:]], sep=","
                        )
            if field in dataset.obs:
                if not pd.api.types.is_categorical_dtype(dataset.obs[field]):
                    dataset.obs[field] = dataset.obs[field].astype(str).astype("category")
                if len(dataset.obs[field].cat.categories) > 1:
                    key_added = "rank_genes_" + str(field)
                    # skip categories with fewer than 3 cells
                    value_counts = dataset.obs[field].value_counts()
                    filtered_value_counts = value_counts[value_counts >= 3]
                    if len(filtered_value_counts) >= 2:
                        if first_time:
                            logger.info(
                                "Using {} to compute markers".format("pegasuspy" if use_pegasus else "scanpy")
                            )
                            first_time = False
                        logger.info("Computing markers for {}".format(field))
                        if use_pegasus:
                            pg.de_analysis(
                                dataset,
                                cluster=field,
                                de_key=key_added,
                                subset=filtered_value_counts.index.to_list(),
                            )
                        else:
                            sc.tl.rank_genes_groups(
                                dataset,
                                field,
                                key_added=key_added,
                                method="t-test",
                                groups=filtered_value_counts.index.to_list(),
                            )
            else:
                raise ValueError(group + " not found in " + ", ".join(dataset.obs.columns))
    schema = self.get_schema()
    schema["format"] = output_format
    if output_format in ["parquet", "zarr"]:
        output_dir = self.base_output
    else:
        output_dir = os.path.splitext(self.base_output)[0]
    filesystem = get_fs(output_dir)
    filesystem.makedirs(output_dir, exist_ok=True)
    results = schema.get("results", [])
    if len(results) > 0:
        uns_dir = os.path.join(output_dir, "uns")
        is_gzip = output_format != "jsonl"
        filesystem.makedirs(uns_dir, exist_ok=True)
        for i in range(len(results)):
            full_result = results[i]
            result_id = full_result.pop("id")
            # keep id, name, and type in the schema; store the rest externally
            results[i] = dict(
                id=result_id,
                name=full_result.pop("name"),
                type=full_result.pop("type"),
                content_type="application/json",
                content_encoding="gzip" if is_gzip else None,
            )
            json_result = to_json(full_result)
            result_path = os.path.join(uns_dir, result_id + (".json.gz" if is_gzip else ".json"))
            with open_file(result_path, "wt", compression="gzip" if is_gzip else None) as out:
                out.write(json_result)
    images = dataset.uns.pop("images", None)
    if images is not None:
        # copy referenced images next to the output and rewrite paths to be relative
        image_dir = os.path.join(output_dir, "images")
        filesystem.makedirs(image_dir, exist_ok=True)
        for image in images:
            src = image["image"]
            dest = os.path.join(image_dir, os.path.basename(src))
            filesystem.copy(src, dest)
            image["image"] = "images/" + os.path.basename(src)
    if output_format == "parquet":
        from cirrocumulus.parquet_output import save_dataset_pq

        save_dataset_pq(dataset, schema, self.base_output, filesystem, self.save_whitelist)
    elif output_format == "jsonl":
        from cirrocumulus.jsonl_io import save_dataset_jsonl

        save_dataset_jsonl(dataset, schema, output_dir, self.base_output, filesystem)
    elif output_format == "zarr":
        from cirrocumulus.zarr_output import save_dataset_zarr

        save_dataset_zarr(dataset, schema, self.base_output, filesystem, self.save_whitelist)
    else:
        raise ValueError("Unknown format: " + str(output_format))

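# execute() externalizes each result body to <output>/uns/<result_id>.json.gz
# (plain .json for jsonl output) and keeps only id/name/type plus content
# metadata in the schema. A hedged read-back sketch; the path is hypothetical:
import gzip
import json

with gzip.open("output/uns/de-1.json.gz", "rt") as f:
    result_body = json.load(f)
print(sorted(result_body.keys()))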