def load_dataset(self, path, extra_server_config=None, extra_dataset_config=None):
    """Load the matrix dataset at *path* and return an open data adaptor.

    Parameters
    ----------
    path : str
        Location of the dataset (passed to both ``app_config`` and
        ``MatrixDataLoader``).
    extra_server_config : dict, optional
        Extra server-level settings forwarded to ``app_config``.
    extra_dataset_config : dict, optional
        Extra dataset-level settings forwarded to ``app_config``.

    Returns
    -------
    The adaptor produced by ``MatrixDataLoader.open``.
    """
    # BUG FIX: the defaults were mutable dicts (`={}`), which Python shares
    # across all calls; use None sentinels and create fresh dicts per call.
    if extra_server_config is None:
        extra_server_config = {}
    if extra_dataset_config is None:
        extra_dataset_config = {}
    config = app_config(
        path,
        extra_server_config=extra_server_config,
        extra_dataset_config=extra_dataset_config,
    )
    loader = MatrixDataLoader(path)
    adaptor = loader.open(config)
    return adaptor
def get_data_adaptor(self):
    """Return the data adaptor, opening the single dataset on first use.

    The adaptor is memoized on the server config, so the dataset is only
    loaded once per application.
    """
    cfg = self.app_config
    srv = cfg.server_config
    if not srv.data_adaptor:
        # Lazily open the configured single dataset and cache the adaptor.
        loader = MatrixDataLoader(srv.single_dataset__datapath, app_config=cfg)
        srv.data_adaptor = loader.open(cfg)
    return srv.data_adaptor
def handle_single_dataset(self, context):
    """Validate single-dataset configuration and preload the dataset.

    context: mapping providing a "messagefn" callable used to report
    progress to the user (presumably the CLI; confirm against caller).
    Raises ConfigurationError on any invalid or conflicting setting.
    """
    # Type-check all single-dataset attributes; each may be str or unset.
    self.check_attr("single_dataset__datapath", (str, type(None)))
    self.check_attr("single_dataset__title", (str, type(None)))
    self.check_attr("single_dataset__about", (str, type(None)))
    self.check_attr("single_dataset__obs_names", (str, type(None)))
    self.check_attr("single_dataset__var_names", (str, type(None)))

    # Exactly one of datapath (single dataset) or dataroot (multi dataset)
    # must be supplied. With no datapath, defer to dataroot handling.
    if self.single_dataset__datapath is None:
        if self.multi_dataset__dataroot is None:
            # TODO: change the error message once dataroot is fully supported
            raise ConfigurationError("missing datapath")
        return
    else:
        if self.multi_dataset__dataroot is not None:
            raise ConfigurationError("must supply only one of datapath or dataroot")

    # create the matrix data cache manager:
    # max_cached=1 because only a single dataset is served in this mode.
    if self.matrix_data_cache_manager is None:
        self.matrix_data_cache_manager = MatrixDataCacheManager(max_cached=1, timelimit_s=None)

    # preload this data set
    matrix_data_loader = MatrixDataLoader(self.single_dataset__datapath, app_config=self.app_config)
    try:
        matrix_data_loader.pre_load_validation()
    except DatasetAccessError as e:
        # Surface dataset access problems as configuration errors.
        raise ConfigurationError(str(e))

    # Warn the user up front when loading a large file may be slow.
    file_size = matrix_data_loader.file_size()
    file_basename = basename(self.single_dataset__datapath)
    if file_size > BIG_FILE_SIZE_THRESHOLD:
        context["messagefn"](f"Loading data from {file_basename}, this may take a while...")
    else:
        context["messagefn"](f"Loading data from {file_basename}.")

    if self.single_dataset__about:
        # The --about link must be an absolute URL (scheme + host present).
        def url_check(url):
            try:
                result = urlparse(url)
                if all([result.scheme, result.netloc]):
                    return True
                else:
                    return False
            except ValueError:
                return False

        if not url_check(self.single_dataset__about):
            raise ConfigurationError(
                "Must provide an absolute URL for --about. (Example format: http://example.com)"
            )
def data_with_tmp_annotations(ext: MatrixDataType, annotations_fixture=False):
    """Open the pbmc3k test dataset in the format given by *ext*, paired
    with a local-file annotations object in a fresh temp directory.

    Returns (data adaptor, tmp_dir, annotations); the caller is
    responsible for cleaning up tmp_dir.
    """
    tmp_dir = tempfile.mkdtemp()
    annotations_file = path.join(tmp_dir, "test_annotations.csv")
    if annotations_fixture:
        # Seed the annotations file from the checked-in fixture CSV.
        shutil.copyfile(
            f"{PROJECT_ROOT}/server/test/test_datasets/pbmc3k-annotations.csv",
            annotations_file,
        )
    dataset_by_type = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: "test/test_datasets/pbmc3k.cxg",
    }
    locator = DataLocator(dataset_by_type[ext])
    config = AppConfig()
    config.update(
        embeddings__names=["umap"],
        presentation__max_categories=100,
        single_dataset__obs_names=None,
        single_dataset__var_names=None,
        diffexp__lfc_cutoff=0.01,
    )
    config.update(single_dataset__datapath=locator.path)
    config.complete_config()
    data = MatrixDataLoader(locator.abspath()).open(config)
    annotations = AnnotationsLocalFile(None, annotations_file)
    return data, tmp_dir, annotations
def data_with_tmp_annotations(ext: MatrixDataType, annotations_fixture=False):
    """Open the pbmc3k test dataset in the format given by *ext*, paired
    with a local-file annotations object in a fresh temp directory.

    Returns (data adaptor, tmp_dir, annotations); the caller is
    responsible for cleaning up tmp_dir.
    """
    tmp_dir = tempfile.mkdtemp()
    annotations_file = path.join(tmp_dir, "test_annotations.csv")
    if annotations_fixture:
        # Seed the annotations file from the checked-in fixture CSV.
        shutil.copyfile(
            f"{PROJECT_ROOT}/server/test/fixtures/pbmc3k-annotations.csv",
            annotations_file,
        )
    dataset_by_type = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: "test/fixtures/pbmc3k.cxg",
    }
    locator = DataLocator(dataset_by_type[ext])
    config = AppConfig()
    config.update_server_config(
        app__flask_secret_key="secret",
        single_dataset__obs_names=None,
        single_dataset__var_names=None,
        single_dataset__datapath=locator.path,
    )
    config.update_default_dataset_config(
        embeddings__names=["umap"],
        presentation__max_categories=100,
        diffexp__lfc_cutoff=0.01,
    )
    config.complete_config()
    data = MatrixDataLoader(locator.abspath()).open(config)
    annotations = AnnotationsLocalFile(None, annotations_file)
    return data, tmp_dir, annotations
def data_with_tmp_tiledb_annotations(ext: MatrixDataType):
    """Open the pbmc3k test dataset in the format given by *ext*, paired
    with a hosted-TileDB annotations object rooted in a fresh temp dir.

    Returns (data adaptor, tmp_dir, annotations); requires a local
    postgres instance (credentials masked in the URI).
    """
    tmp_dir = tempfile.mkdtemp()
    dataset_by_type = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: "test/fixtures/pbmc3k.cxg",
    }
    locator = DataLocator(dataset_by_type[ext])
    config = AppConfig()
    config.update_server_config(
        app__flask_secret_key="secret",
        multi_dataset__dataroot=locator.path,
        authentication__type="test",
        authentication__insecure_test_environment=True,
    )
    config.update_default_dataset_config(
        embeddings__names=["umap"],
        presentation__max_categories=100,
        diffexp__lfc_cutoff=0.01,
        user_annotations__type="hosted_tiledb_array",
        user_annotations__hosted_tiledb_array__db_uri="postgresql://*****:*****@localhost:5432",
        user_annotations__hosted_tiledb_array__hosted_file_directory=tmp_dir,
    )
    config.complete_config()
    data = MatrixDataLoader(locator.abspath()).open(config)
    annotations = AnnotationsHostedTileDB(
        tmp_dir,
        DbUtils("postgresql://*****:*****@localhost:5432"),
    )
    return data, tmp_dir, annotations
def dataroot_test_index():
    """Render a simple index page listing all loadable datasets.

    Walks every configured dataroot, keeps only the entries that
    MatrixDataLoader can open, and returns an HTML page of links.
    """
    # the following index page is meant for testing/debugging purposes
    data = '<!doctype html><html lang="en">'
    data += "<head><title>Hosted Cellxgene</title></head>"
    data += "<body><H1>Welcome to cellxgene</H1>"

    config = current_app.app_config
    server_config = config.server_config
    datasets = []
    for dataroot_dict in server_config.multi_dataset__dataroot.values():
        dataroot = dataroot_dict["dataroot"]
        url_dataroot = dataroot_dict["base_url"]
        locator = DataLocator(dataroot, region_name=server_config.data_locator__s3__region_name)
        for fname in locator.ls():
            location = path_join(dataroot, fname)
            try:
                # Constructing the loader validates that the entry is a
                # recognized dataset type; failures are simply skipped.
                MatrixDataLoader(location, app_config=config)
                datasets.append((url_dataroot, fname))
            except DatasetAccessError:
                # skip over invalid datasets
                pass

    data += "<br/>Select one of these datasets...<br/>"
    data += "<ul>"
    datasets.sort()
    for url_dataroot, dataset in datasets:
        # BUG FIX: the href value was previously unquoted; dataset names with
        # spaces or other attribute-breaking characters produced invalid HTML.
        # NOTE(review): names come from the configured dataroot listing and are
        # not HTML-escaped here — escape them if the listing is ever untrusted.
        data += f'<li><a href="{url_dataroot}/{dataset}">{dataset}</a></li>'
    data += "</ul>"
    data += "</body></html>"
    return make_response(data)
def handle_embeddings(self, context):
    """Validate embedding configuration attributes.

    Raises ConfigurationError when re-embedding is enabled but unsupported:
    it requires an H5AD single dataset and cannot run in --backed mode.
    """
    self.check_attr("embeddings__names", list)
    self.check_attr("embeddings__enable_reembedding", bool)
    if self.app_config.server_config.single_dataset__datapath:
        if self.embeddings__enable_reembedding:
            matrix_data_loader = MatrixDataLoader(self.single_dataset__datapath, app_config=self.app_config)
            if matrix_data_loader.matrix_data_type() != MatrixDataType.H5AD:
                # BUG FIX: message previously began with a stray apostrophe
                # ("'enable-reembedding is only supported...").
                raise ConfigurationError("enable-reembedding is only supported with H5AD files.")
            if self.adaptor__anndata_adaptor__backed:
                raise ConfigurationError("enable-reembedding is not supported when run in --backed mode.")
def handle_single_dataset(self, context):
    """Validate single-dataset configuration and preload the dataset.

    context: mapping providing a "messagefn" callable used to report
    progress to the user (presumably the CLI; confirm against caller).
    Raises ConfigurationError on any invalid setting or inaccessible data.
    """
    # Type-check all single-dataset attributes; each may be str or unset.
    self.validate_correct_type_of_configuration_attribute("single_dataset__datapath", (str, type(None)))
    self.validate_correct_type_of_configuration_attribute("single_dataset__title", (str, type(None)))
    self.validate_correct_type_of_configuration_attribute("single_dataset__about", (str, type(None)))
    self.validate_correct_type_of_configuration_attribute("single_dataset__obs_names", (str, type(None)))
    self.validate_correct_type_of_configuration_attribute("single_dataset__var_names", (str, type(None)))

    # preload this data set
    matrix_data_loader = MatrixDataLoader(self.single_dataset__datapath, app_config=self.app_config)
    try:
        matrix_data_loader.pre_load_validation()
    except DatasetAccessError as e:
        # Surface dataset access problems as configuration errors.
        raise ConfigurationError(str(e))

    # Warn the user up front when loading a large file may be slow.
    file_size = matrix_data_loader.file_size()
    file_basename = basename(self.single_dataset__datapath)
    if file_size > BIG_FILE_SIZE_THRESHOLD:
        context["messagefn"](f"Loading data from {file_basename}, this may take a while...")
    else:
        context["messagefn"](f"Loading data from {file_basename}.")

    if self.single_dataset__about:
        # The --about link must be an absolute URL (scheme + host present).
        def url_check(url):
            try:
                result = urlparse(url)
                if all([result.scheme, result.netloc]):
                    return True
                else:
                    return False
            except ValueError:
                return False

        if not url_check(self.single_dataset__about):
            raise ConfigurationError(
                "Must provide an absolute URL for --about. (Example format: http://example.com)"
            )
def main():
    """CLI entry point: benchmark differential-expression computation.

    Selects two row groups (random samples of given sizes, or rows matching
    an obs variable:value), runs the chosen diffexp algorithm for the
    requested number of trials, and prints per-trial timings (and results
    with --show).
    """
    parser = argparse.ArgumentParser("A command to test diffexp")
    parser.add_argument("dataset", help="name of a dataset to load")
    parser.add_argument("-na", "--numA", type=int, help="number of rows in group A")
    parser.add_argument("-nb", "--numB", type=int, help="number of rows in group B")
    parser.add_argument("-va", "--varA", help="obs variable:value to use for group A")
    parser.add_argument("-vb", "--varB", help="obs variable:value to use for group B")
    parser.add_argument("-t", "--trials", default=1, type=int, help="number of trials")
    parser.add_argument("-a", "--alg", choices=("default", "generic"), default="default", help="algorithm to use")
    parser.add_argument("-s", "--show", default=False, action="store_true", help="show the results")
    parser.add_argument("-n", "--new-selection", default=False, action="store_true", help="change the selection between each trial")
    parser.add_argument("--seed", default=1, type=int, help="set the random seed")
    args = parser.parse_args()

    # Build a minimal app config and open the dataset through the loader.
    app_config = AppConfig()
    app_config.update_server_config(single_dataset__datapath=args.dataset)
    app_config.update_server_config(app__verbose=True)
    app_config.complete_config()

    loader = MatrixDataLoader(args.dataset)
    adaptor = loader.open(app_config)

    # Seed both RNGs so selections (and any stochastic algorithm steps)
    # are reproducible for a given --seed.
    random.seed(args.seed)
    np.random.seed(args.seed)
    rows = adaptor.get_shape()[0]

    # Group A: either a random sample of numA rows, or rows matching varA.
    if args.numA:
        filterA = random.sample(range(rows), args.numA)
    elif args.varA:
        vname, vval = args.varA.split(":")
        filterA = get_filter_from_obs(adaptor, vname, vval)
    else:
        print("must supply numA or varA")
        sys.exit(1)

    # Group B: same selection logic as group A.
    if args.numB:
        filterB = random.sample(range(rows), args.numB)
    elif args.varB:
        vname, vval = args.varB.split(":")
        filterB = get_filter_from_obs(adaptor, vname, vval)
    else:
        print("must supply numB or varB")
        sys.exit(1)

    for i in range(args.trials):
        # Optionally re-draw the random selections on every trial
        # (only the numA/numB random groups can change; obs-based
        # selections stay fixed).
        if args.new_selection:
            if args.numA:
                filterA = random.sample(range(rows), args.numA)
            if args.numB:
                filterB = random.sample(range(rows), args.numB)

        # Convert the row-index selections into boolean masks.
        maskA = np.zeros(rows, dtype=bool)
        maskA[filterA] = True
        maskB = np.zeros(rows, dtype=bool)
        maskB[filterB] = True

        # Time one diffexp run with the selected algorithm.
        t1 = time.time()
        if args.alg == "default":
            results = adaptor.compute_diffexp_ttest(maskA, maskB)
        elif args.alg == "generic":
            results = diffexp_generic.diffexp_ttest(adaptor, maskA, maskB)
        t2 = time.time()
        print("TIME=", t2 - t1)
        if args.show:
            for res in results:
                print(res)
def main():
    """CLI entry point: benchmark differential-expression computation.

    Draws two disjoint random row groups of the requested sizes, runs the
    chosen diffexp algorithm for the requested number of trials, and prints
    per-trial timings (and results with --show).
    """
    parser = argparse.ArgumentParser("A command to test diffexp")
    parser.add_argument("dataset", help="name of a dataset to load")
    parser.add_argument("-na", "--numA", type=int, required=True, help="number of rows in group A")
    parser.add_argument("-nb", "--numB", type=int, required=True, help="number of rows in group B")
    parser.add_argument("-t", "--trials", default=1, type=int, help="number of trials")
    parser.add_argument("-a", "--alg", choices=("default", "generic", "cxg"), default="default", help="algorithm to use")
    parser.add_argument("-s", "--show", default=False, action="store_true", help="show the results")
    parser.add_argument("-n", "--new-selection", default=False, action="store_true", help="change the selection between each trial")
    parser.add_argument("--seed", default=1, type=int, help="set the random seed")
    args = parser.parse_args()

    # Build a minimal app config (attribute-style API) and open the dataset.
    app_config = AppConfig()
    app_config.single_dataset__datapath = args.dataset
    app_config.server__verbose = True
    app_config.complete_config()

    loader = MatrixDataLoader(args.dataset)
    adaptor = loader.open(app_config)

    if args.show:
        # For CXG-backed data, dump the X array schema for inspection.
        if isinstance(adaptor, CxgAdaptor):
            adaptor.open_array("X").schema.dump()

    numA = args.numA
    numB = args.numB
    rows = adaptor.get_shape()[0]

    # Seed the RNG so selections are reproducible for a given --seed.
    random.seed(args.seed)
    if not args.new_selection:
        # Fixed selection: draw numA+numB distinct rows once, then split
        # the sample into the two groups.
        samples = random.sample(range(rows), numA + numB)
        filterA = samples[:numA]
        filterB = samples[numA:]

    for i in range(args.trials):
        if args.new_selection:
            # Re-draw the selection for each trial.
            samples = random.sample(range(rows), numA + numB)
            filterA = samples[:numA]
            filterB = samples[numA:]

        # Convert the row-index selections into boolean masks.
        maskA = np.zeros(rows, dtype=bool)
        maskA[filterA] = True
        maskB = np.zeros(rows, dtype=bool)
        maskB[filterB] = True

        # Time one diffexp run with the selected algorithm.
        t1 = time.time()
        if args.alg == "default":
            results = adaptor.compute_diffexp_ttest(maskA, maskB)
        elif args.alg == "generic":
            results = diffexp_generic.diffexp_ttest(adaptor, maskA, maskB)
        elif args.alg == "cxg":
            # The cxg algorithm reads TileDB arrays directly.
            if not isinstance(adaptor, CxgAdaptor):
                print("cxg only works with CxgAdaptor")
                sys.exit(1)
            results = diffexp_cxg.diffexp_ttest(adaptor, maskA, maskB)
        t2 = time.time()
        print("TIME=", t2 - t1)
        if args.show:
            for res in results:
                print(res)