def _load_data(self, data_locator):
    """Read the h5ad file referenced by ``data_locator`` into ``self.data``.

    The file is opened through ``data_locator.local_handle()`` and handed to
    ``anndata.read_h5ad``.  Backed mode (``backed='r'``) is requested when
    ``self.config['backed']`` is truthy; otherwise ``backed=None`` is passed.

    Raises:
        ScanpyFileError: if reading fails.  A ``ValueError`` is reported as
            a file-format problem, a ``MemoryError`` as out-of-memory, and
            any other exception as a missing or inaccessible file.  The
            original exception is attached as ``__cause__``.
    """
    try:
        # there is no guarantee data_locator indicates a local file.  The AnnData
        # API will only consume local file objects.  If we get a non-local object,
        # make a copy in tmp, and delete it after we load into memory.
        with data_locator.local_handle() as lh:
            # as of AnnData 0.6.19, backed mode performs initial load fast, but at the
            # cost of significantly slower access to X data.
            backed = 'r' if self.config['backed'] else None
            self.data = anndata.read_h5ad(lh, backed=backed)
    except ValueError as err:
        # Chain explicitly so the underlying reader error stays visible as
        # the direct cause instead of an incidental "during handling" context.
        raise ScanpyFileError(
            "File must be in the .h5ad format. Please read "
            "https://github.com/theislab/scanpy_usage/blob/master/170505_seurat/info_h5ad.md to "
            "learn more about this format. You may be able to convert your file into this format "
            "using `cellxgene prepare`, please run `cellxgene prepare --help` for more "
            "information."
        ) from err
    except MemoryError as err:
        raise ScanpyFileError("Out of memory - file is too large for available memory.") from err
    except Exception as err:
        raise ScanpyFileError(
            f"{err} - file not found or is inaccessible. File must be an .h5ad object. "
            f"Please check your input and try again."
        ) from err
def annotation_put_fbs(self, axis, fbs, uid=None, collection=None):
    """Decode ``fbs`` into a label dataframe and persist it via ``write_labels``.

    Args:
        axis: must equal ``Axis.OBS``; anything else is rejected.
        fbs: encoded matrix, decoded with ``decode_matrix_fbs``.
        uid: forwarded to ``get_anno_fname`` / ``get_anno_backup_dir``.
        collection: forwarded to ``get_anno_fname`` / ``get_anno_backup_dir``.

    Returns:
        The result of ``jsonify_scanpy({"status": "OK"})``.

    Raises:
        DisabledFeatureError: if ``self.config["annotations"]`` is falsy.
        ScanpyFileError: if no annotation file name can be determined.
        ValueError: if ``axis`` is not ``Axis.OBS``.
        KeyError: if the decoded column names are not unique or collide
            with columns already in ``self.data.obs``.
    """
    if not self.config["annotations"]:
        raise DisabledFeatureError("Writable annotations are not enabled")

    fname = self.get_anno_fname(uid, collection)
    if not fname:
        raise ScanpyFileError("Writable annotations - unable to determine file name for annotations")

    if axis != Axis.OBS:
        raise ValueError("Only OBS dimension access is supported")

    labels = decode_matrix_fbs(fbs)
    if not labels.empty:
        labels.index = self.original_obs_index
    self._validate_label_data(labels)  # paranoia

    # New label columns must be unique and must not shadow an existing obs column.
    incoming_names = set(labels.columns)
    existing_names = set(self.data.obs.columns)
    duplicate_columns = list(incoming_names & existing_names)
    if duplicate_columns or not labels.columns.is_unique:
        raise KeyError(f"Labels file may not contain column names which overlap "
                       f"with h5ad obs columns {duplicate_columns}")

    # Multi-threading is often enabled, so building the header and writing
    # the labels is treated as a critical section.
    with self.label_lock:
        lastmod = self.data_locator.lastmodtime()
        if lastmod is None:
            lastmodstr = "'unknown'"
        else:
            lastmodstr = lastmod.isoformat(timespec="seconds")
        generated_at = datetime.now().isoformat(timespec='seconds')
        header = (
            f"# Annotations generated on {generated_at} "
            f"using cellxgene version {cellxgene_version}\n"
            f"# Input data file was {self.data_locator.uri_or_path}, "
            f"which was last modified on {lastmodstr}\n"
        )
        backup_dir = self.get_anno_backup_dir(uid, collection)
        write_labels(fname, labels, header, backup_dir=backup_dir)

    return jsonify_scanpy({"status": "OK"})
def _load_data(data):
    """Read ``data`` with ``sc.read`` and return the result.

    Raises:
        ScanpyFileError: if ``sc.read`` raises.  A ``ValueError`` is reported
            as a file-format problem; any other exception is reported with
            its message embedded.  The original exception is attached as
            ``__cause__``.
    """
    # Based on benchmarking, cache=True has no impact on perf.
    # Note: as of current scanpy/anndata release, setting backed='r' will
    # result in an error. https://github.com/theislab/anndata/issues/79
    try:
        return sc.read(data, cache=True)
    except ValueError as err:
        # Chain explicitly so the reader's own error remains the direct cause.
        raise ScanpyFileError(
            "File must be in the .h5ad format. Please read "
            "https://github.com/theislab/scanpy_usage/blob/master/170505_seurat/info_h5ad.md to "
            "learn more about this format. You may be able to convert your file into this format "
            "using `cellxgene prepare`, please run `cellxgene prepare --help` for more "
            "information.") from err
    except Exception as err:
        raise ScanpyFileError(
            f"Error while loading file: {err}, File must be in the .h5ad format, please check "
            f"that your input and try again.") from err
def _load_data(self, data):
    """Read ``data`` with ``anndata.read_h5ad`` and store it on ``self.data``.

    Raises:
        ScanpyFileError: if reading fails.  A ``ValueError`` is reported as
            a file-format problem, a ``MemoryError`` as out-of-memory, and
            any other exception with its message embedded.  The original
            exception is attached as ``__cause__``.
    """
    # as of AnnData 0.6.19, backed mode performs initial load fast, but at the
    # cost of significantly slower access to X data.
    try:
        self.data = anndata.read_h5ad(data)
    except ValueError as err:
        # Chain explicitly so the reader's own error remains the direct cause.
        raise ScanpyFileError(
            "File must be in the .h5ad format. Please read "
            "https://github.com/theislab/scanpy_usage/blob/master/170505_seurat/info_h5ad.md to "
            "learn more about this format. You may be able to convert your file into this format "
            "using `cellxgene prepare`, please run `cellxgene prepare --help` for more "
            "information.") from err
    except MemoryError as err:
        raise ScanpyFileError(
            "Error while loading file: out of memory, file is too large"
            " for memory available") from err
    except Exception as err:
        raise ScanpyFileError(
            f"Error while loading file: {err}, File must be in the .h5ad format, please check "
            f"that your input and try again.") from err
def annotation_to_fbs_matrix(self, axis, fields=None, uid=None, collection=None):
    """Encode the annotations for ``axis`` with ``encode_matrix_fbs``.

    When ``axis`` equals ``Axis.OBS`` the source dataframe is
    ``self.data.obs``.  If ``self.config["annotations"]`` is truthy, labels
    are read with ``read_labels`` and, when non-empty, joined onto it
    (``self.config['obs_names']`` is passed as the second argument to
    ``join``).  For any other ``axis`` the source is ``self.data.var``.

    Args:
        axis: selects obs (``Axis.OBS``) or var (anything else).
        fields: optional column selection; applied as ``df[fields]`` when
            it is not ``None`` and has non-zero length.
        uid: forwarded to ``get_anno_fname``.
        collection: forwarded to ``get_anno_fname``.

    Returns:
        Whatever ``encode_matrix_fbs(df, col_idx=df.columns)`` returns.

    Raises:
        ScanpyFileError: if determining the label file name or reading the
            labels fails; the original exception is attached as
            ``__cause__``.
    """
    if axis == Axis.OBS:
        labels = None
        if self.config["annotations"]:
            try:
                labels = read_labels(self.get_anno_fname(uid, collection))
            except Exception as e:
                # Chain explicitly so the underlying failure stays visible
                # as the direct cause of the reported error.
                raise ScanpyFileError(
                    f"Error while loading label file: {e}, File must be in the .csv format, please check "
                    f"your input and try again."
                ) from e

        if labels is not None and not labels.empty:
            df = self.data.obs.join(labels, self.config['obs_names'])
        else:
            df = self.data.obs
    else:
        df = self.data.var

    if fields is not None and len(fields) > 0:
        df = df[fields]
    return encode_matrix_fbs(df, col_idx=df.columns)