Example #1
0
 def load_dataset(self,
                  path,
                  extra_server_config={},
                  extra_dataset_config={}):
     config = app_config(path,
                         extra_server_config=extra_server_config,
                         extra_dataset_config=extra_dataset_config)
     loader = MatrixDataLoader(path)
     adaptor = loader.open(config)
     return adaptor
    def get_data_adaptor(self):
        server_config = self.app_config.server_config
        if not server_config.data_adaptor:
            matrix_data_loader = MatrixDataLoader(
                server_config.single_dataset__datapath,
                app_config=self.app_config)
            server_config.data_adaptor = matrix_data_loader.open(
                self.app_config)

        return server_config.data_adaptor
Example #3
0
    def handle_single_dataset(self, context):
        self.check_attr("single_dataset__datapath", (str, type(None)))
        self.check_attr("single_dataset__title", (str, type(None)))
        self.check_attr("single_dataset__about", (str, type(None)))
        self.check_attr("single_dataset__obs_names", (str, type(None)))
        self.check_attr("single_dataset__var_names", (str, type(None)))

        if self.single_dataset__datapath is None:
            if self.multi_dataset__dataroot is None:
                # TODO:  change the error message once dataroot is fully supported
                raise ConfigurationError("missing datapath")
            return
        else:
            if self.multi_dataset__dataroot is not None:
                raise ConfigurationError(
                    "must supply only one of datapath or dataroot")

        # create the matrix data cache manager:
        if self.matrix_data_cache_manager is None:
            self.matrix_data_cache_manager = MatrixDataCacheManager(
                max_cached=1, timelimit_s=None)

        # preload this data set
        matrix_data_loader = MatrixDataLoader(self.single_dataset__datapath,
                                              app_config=self.app_config)
        try:
            matrix_data_loader.pre_load_validation()
        except DatasetAccessError as e:
            raise ConfigurationError(str(e))

        file_size = matrix_data_loader.file_size()
        file_basename = basename(self.single_dataset__datapath)
        if file_size > BIG_FILE_SIZE_THRESHOLD:
            context["messagefn"](
                f"Loading data from {file_basename}, this may take a while...")
        else:
            context["messagefn"](f"Loading data from {file_basename}.")

        if self.single_dataset__about:

            def url_check(url):
                try:
                    result = urlparse(url)
                    if all([result.scheme, result.netloc]):
                        return True
                    else:
                        return False
                except ValueError:
                    return False

            if not url_check(self.single_dataset__about):
                raise ConfigurationError(
                    "Must provide an absolute URL for --about. (Example format: http://example.com)"
                )
Example #4
0
def data_with_tmp_annotations(ext: MatrixDataType, annotations_fixture=False):
    tmp_dir = tempfile.mkdtemp()
    annotations_file = path.join(tmp_dir, "test_annotations.csv")
    if annotations_fixture:
        shutil.copyfile(
            f"{PROJECT_ROOT}/server/test/test_datasets/pbmc3k-annotations.csv",
            annotations_file)
    args = {
        "embeddings__names": ["umap"],
        "presentation__max_categories": 100,
        "single_dataset__obs_names": None,
        "single_dataset__var_names": None,
        "diffexp__lfc_cutoff": 0.01,
    }
    fname = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: "test/test_datasets/pbmc3k.cxg",
    }[ext]
    data_locator = DataLocator(fname)
    config = AppConfig()
    config.update(**args)
    config.update(single_dataset__datapath=data_locator.path)
    config.complete_config()
    data = MatrixDataLoader(data_locator.abspath()).open(config)
    annotations = AnnotationsLocalFile(None, annotations_file)
    return data, tmp_dir, annotations
Example #5
0
def data_with_tmp_annotations(ext: MatrixDataType, annotations_fixture=False):
    tmp_dir = tempfile.mkdtemp()
    annotations_file = path.join(tmp_dir, "test_annotations.csv")
    if annotations_fixture:
        shutil.copyfile(
            f"{PROJECT_ROOT}/server/test/fixtures/pbmc3k-annotations.csv",
            annotations_file)
    fname = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: "test/fixtures/pbmc3k.cxg",
    }[ext]
    data_locator = DataLocator(fname)
    config = AppConfig()
    config.update_server_config(
        app__flask_secret_key="secret",
        single_dataset__obs_names=None,
        single_dataset__var_names=None,
        single_dataset__datapath=data_locator.path,
    )
    config.update_default_dataset_config(
        embeddings__names=["umap"],
        presentation__max_categories=100,
        diffexp__lfc_cutoff=0.01,
    )

    config.complete_config()
    data = MatrixDataLoader(data_locator.abspath()).open(config)
    annotations = AnnotationsLocalFile(None, annotations_file)
    return data, tmp_dir, annotations
Example #6
0
def data_with_tmp_tiledb_annotations(ext: MatrixDataType):
    tmp_dir = tempfile.mkdtemp()
    fname = {
        MatrixDataType.H5AD: f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad",
        MatrixDataType.CXG: "test/fixtures/pbmc3k.cxg",
    }[ext]
    data_locator = DataLocator(fname)
    config = AppConfig()
    config.update_server_config(
        app__flask_secret_key="secret",
        multi_dataset__dataroot=data_locator.path,
        authentication__type="test",
        authentication__insecure_test_environment=True,
    )
    config.update_default_dataset_config(
        embeddings__names=["umap"],
        presentation__max_categories=100,
        diffexp__lfc_cutoff=0.01,
        user_annotations__type="hosted_tiledb_array",
        user_annotations__hosted_tiledb_array__db_uri=
        "postgresql://*****:*****@localhost:5432",
        user_annotations__hosted_tiledb_array__hosted_file_directory=tmp_dir,
    )

    config.complete_config()

    data = MatrixDataLoader(data_locator.abspath()).open(config)
    annotations = AnnotationsHostedTileDB(
        tmp_dir,
        DbUtils("postgresql://*****:*****@localhost:5432"),
    )
    return data, tmp_dir, annotations
Example #7
0
def dataroot_test_index():
    # the following index page is meant for testing/debugging purposes
    data = '<!doctype html><html lang="en">'
    data += "<head><title>Hosted Cellxgene</title></head>"
    data += "<body><H1>Welcome to cellxgene</H1>"

    config = current_app.app_config
    server_config = config.server_config
    datasets = []
    for dataroot_dict in server_config.multi_dataset__dataroot.values():
        dataroot = dataroot_dict["dataroot"]
        url_dataroot = dataroot_dict["base_url"]
        locator = DataLocator(
            dataroot, region_name=server_config.data_locator__s3__region_name)
        for fname in locator.ls():
            location = path_join(dataroot, fname)
            try:
                MatrixDataLoader(location, app_config=config)
                datasets.append((url_dataroot, fname))
            except DatasetAccessError:
                # skip over invalid datasets
                pass

    data += "<br/>Select one of these datasets...<br/>"
    data += "<ul>"
    datasets.sort()
    for url_dataroot, dataset in datasets:
        data += f"<li><a href={url_dataroot}/{dataset}>{dataset}</a></li>"
    data += "</ul>"
    data += "</body></html>"

    return make_response(data)
Example #8
0
    def handle_embeddings(self, context):
        self.check_attr("embeddings__names", list)
        self.check_attr("embeddings__enable_reembedding", bool)

        if self.app_config.server_config.single_dataset__datapath:
            if self.embeddings__enable_reembedding:
                matrix_data_loader = MatrixDataLoader(
                    self.single_dataset__datapath, app_config=self.app_config)
                if matrix_data_loader.matrix_data_type(
                ) != MatrixDataType.H5AD:
                    raise ConfigurationError(
                        "'enable-reembedding is only supported with H5AD files."
                    )
                if self.adaptor__anndata_adaptor__backed:
                    raise ConfigurationError(
                        "enable-reembedding is not supported when run in --backed mode."
                    )
Example #9
0
    def handle_single_dataset(self, context):
        self.validate_correct_type_of_configuration_attribute(
            "single_dataset__datapath", (str, type(None)))
        self.validate_correct_type_of_configuration_attribute(
            "single_dataset__title", (str, type(None)))
        self.validate_correct_type_of_configuration_attribute(
            "single_dataset__about", (str, type(None)))
        self.validate_correct_type_of_configuration_attribute(
            "single_dataset__obs_names", (str, type(None)))
        self.validate_correct_type_of_configuration_attribute(
            "single_dataset__var_names", (str, type(None)))

        # preload this data set
        matrix_data_loader = MatrixDataLoader(self.single_dataset__datapath,
                                              app_config=self.app_config)
        try:
            matrix_data_loader.pre_load_validation()
        except DatasetAccessError as e:
            raise ConfigurationError(str(e))

        file_size = matrix_data_loader.file_size()
        file_basename = basename(self.single_dataset__datapath)
        if file_size > BIG_FILE_SIZE_THRESHOLD:
            context["messagefn"](
                f"Loading data from {file_basename}, this may take a while...")
        else:
            context["messagefn"](f"Loading data from {file_basename}.")

        if self.single_dataset__about:

            def url_check(url):
                try:
                    result = urlparse(url)
                    if all([result.scheme, result.netloc]):
                        return True
                    else:
                        return False
                except ValueError:
                    return False

            if not url_check(self.single_dataset__about):
                raise ConfigurationError(
                    "Must provide an absolute URL for --about. (Example format: http://example.com)"
                )
Example #10
0
def main():
    parser = argparse.ArgumentParser("A command to test diffexp")
    parser.add_argument("dataset", help="name of a dataset to load")
    parser.add_argument("-na",
                        "--numA",
                        type=int,
                        help="number of rows in group A")
    parser.add_argument("-nb",
                        "--numB",
                        type=int,
                        help="number of rows in group B")
    parser.add_argument("-va",
                        "--varA",
                        help="obs variable:value to use for group A")
    parser.add_argument("-vb",
                        "--varB",
                        help="obs variable:value to use for group B")
    parser.add_argument("-t",
                        "--trials",
                        default=1,
                        type=int,
                        help="number of trials")
    parser.add_argument("-a",
                        "--alg",
                        choices=("default", "generic"),
                        default="default",
                        help="algorithm to use")
    parser.add_argument("-s",
                        "--show",
                        default=False,
                        action="store_true",
                        help="show the results")
    parser.add_argument("-n",
                        "--new-selection",
                        default=False,
                        action="store_true",
                        help="change the selection between each trial")
    parser.add_argument("--seed",
                        default=1,
                        type=int,
                        help="set the random seed")

    args = parser.parse_args()

    app_config = AppConfig()
    app_config.update_server_config(single_dataset__datapath=args.dataset)
    app_config.update_server_config(app__verbose=True)
    app_config.complete_config()

    loader = MatrixDataLoader(args.dataset)
    adaptor = loader.open(app_config)

    random.seed(args.seed)
    np.random.seed(args.seed)
    rows = adaptor.get_shape()[0]

    if args.numA:
        filterA = random.sample(range(rows), args.numA)
    elif args.varA:
        vname, vval = args.varA.split(":")
        filterA = get_filter_from_obs(adaptor, vname, vval)
    else:
        print("must supply numA or varA")
        sys.exit(1)

    if args.numB:
        filterB = random.sample(range(rows), args.numB)
    elif args.varB:
        vname, vval = args.varB.split(":")
        filterB = get_filter_from_obs(adaptor, vname, vval)
    else:
        print("must supply numB or varB")
        sys.exit(1)

    for i in range(args.trials):
        if args.new_selection:
            if args.numA:
                filterA = random.sample(range(rows), args.numA)
            if args.numB:
                filterB = random.sample(range(rows), args.numB)

        maskA = np.zeros(rows, dtype=bool)
        maskA[filterA] = True
        maskB = np.zeros(rows, dtype=bool)
        maskB[filterB] = True

        t1 = time.time()
        if args.alg == "default":
            results = adaptor.compute_diffexp_ttest(maskA, maskB)
        elif args.alg == "generic":
            results = diffexp_generic.diffexp_ttest(adaptor, maskA, maskB)

        t2 = time.time()
        print("TIME=", t2 - t1)

    if args.show:
        for res in results:
            print(res)
Example #11
0
def main():
    parser = argparse.ArgumentParser("A command to test diffexp")
    parser.add_argument("dataset", help="name of a dataset to load")
    parser.add_argument("-na",
                        "--numA",
                        type=int,
                        required=True,
                        help="number of rows in group A")
    parser.add_argument("-nb",
                        "--numB",
                        type=int,
                        required=True,
                        help="number of rows in group B")
    parser.add_argument("-t",
                        "--trials",
                        default=1,
                        type=int,
                        help="number of trials")
    parser.add_argument("-a",
                        "--alg",
                        choices=("default", "generic", "cxg"),
                        default="default",
                        help="algorithm to use")
    parser.add_argument("-s",
                        "--show",
                        default=False,
                        action="store_true",
                        help="show the results")
    parser.add_argument("-n",
                        "--new-selection",
                        default=False,
                        action="store_true",
                        help="change the selection between each trial")
    parser.add_argument("--seed",
                        default=1,
                        type=int,
                        help="set the random seed")

    args = parser.parse_args()

    app_config = AppConfig()
    app_config.single_dataset__datapath = args.dataset
    app_config.server__verbose = True
    app_config.complete_config()

    loader = MatrixDataLoader(args.dataset)
    adaptor = loader.open(app_config)

    if args.show:
        if isinstance(adaptor, CxgAdaptor):
            adaptor.open_array("X").schema.dump()

    numA = args.numA
    numB = args.numB
    rows = adaptor.get_shape()[0]

    random.seed(args.seed)

    if not args.new_selection:
        samples = random.sample(range(rows), numA + numB)
        filterA = samples[:numA]
        filterB = samples[numA:]

    for i in range(args.trials):
        if args.new_selection:
            samples = random.sample(range(rows), numA + numB)
            filterA = samples[:numA]
            filterB = samples[numA:]

        maskA = np.zeros(rows, dtype=bool)
        maskA[filterA] = True
        maskB = np.zeros(rows, dtype=bool)
        maskB[filterB] = True

        t1 = time.time()
        if args.alg == "default":
            results = adaptor.compute_diffexp_ttest(maskA, maskB)
        elif args.alg == "generic":
            results = diffexp_generic.diffexp_ttest(adaptor, maskA, maskB)
        elif args.alg == "cxg":
            if not isinstance(adaptor, CxgAdaptor):
                print("cxg only works with CxgAdaptor")
                sys.exit(1)
            results = diffexp_cxg.diffexp_ttest(adaptor, maskA, maskB)

        t2 = time.time()
        print("TIME=", t2 - t1)

    if args.show:
        for res in results:
            print(res)