def load_file(filepath):
    # Resolve the two special aliases to concrete paths.
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath.endswith('h5ad'):
            adata = anndata.read_h5ad(filepath)
        elif filepath.endswith('csv'):
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        elif filepath.endswith('xlsx'):
            adata = anndata.read_excel(filepath)
        elif filepath.endswith('mtx'):
            adata = anndata.read_mtx(filepath)
        elif filepath.endswith(('txt', 'tab', 'data')):
            adata = anndata.read_text(filepath)
        elif filepath.endswith('h5'):
            adata = anndata.read_hdf(filepath)
        elif filepath.endswith('loom'):
            adata = anndata.read_loom(filepath)
        else:
            # unknown extension: fall through to the generic error below
            raise IncorrectFileFormat("Unsupported file extension.")
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    adata.uns['dataset'] = dataset
    return adata
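# Hedged sketch: load_file assumes two names defined elsewhere in the
# project, join_root and IncorrectFileFormat. Plausible stand-ins are
# sketched below; the root path and docstring are illustrative assumptions,
# not the project's actual definitions.
import os

_ROOT = os.path.dirname(os.path.abspath(__file__))

def join_root(relpath):
    # Resolve a path relative to this module's directory (assumed behavior).
    return os.path.abspath(os.path.join(_ROOT, relpath))

class IncorrectFileFormat(Exception):
    """Raised when a file cannot be parsed into an AnnData object."""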
def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    # logg, settings, avail_exts and the read_* helpers are defined in the
    # surrounding module.
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (settings.cachedir + filename.lstrip('./')
                      .replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speed up reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
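# Hedged worked example of the cache-path construction in _read above;
# cachedir and filename values are made up. Note that str.lstrip('./')
# strips any leading run of '.' and '/' characters, not the literal prefix
# './'; the pathlib-based rewrite further below uses a _slugify helper
# instead.
cachedir = './cache/'
filename = './data/pbmc/matrix.csv'
ext = 'csv'
filename_cache = (cachedir + filename.lstrip('./')
                  .replace('/', '-').replace('.' + ext, '.h5ad'))
print(filename_cache)  # ./cache/data-pbmc-matrix.h5ad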
def load_file(filepath):
    t_flag = False
    # Resolve the two special aliases to concrete paths.
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath.endswith('h5ad'):
            adata = anndata.read_h5ad(filepath)
        elif filepath.endswith('csv'):
            adata = anndata.read_csv(filepath)
            if t_flag:
                # only the bundled default dataset needs transposing
                adata = adata.T
        elif filepath.endswith('xlsx'):
            adata = anndata.read_excel(filepath)
        elif filepath.endswith('mtx'):
            adata = anndata.read_mtx(filepath)
        elif filepath.endswith(('txt', 'tab', 'data')):
            adata = anndata.read_text(filepath)
        elif filepath.endswith('h5'):
            adata = anndata.read_hdf(filepath)
        elif filepath.endswith('loom'):
            adata = anndata.read_loom(filepath)
        else:
            # unknown extension: fall through to the generic error below
            raise IncorrectFileFormat("Unsupported file extension.")
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format: stored keys come back as
    # strings after a save/load cycle, so cast them back to int.
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
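# Illustration of the cluster_names re-keying above, assuming the bidict
# package; `loaded` stands in for what adata.uns['cluster_names'] may look
# like after a round trip through .h5ad, where integer keys come back as
# strings.
from bidict import bidict

loaded = {'0': 'T cells', '1': 'B cells'}
names = bidict(loaded)
for key in list(names.keys()):
    names[int(key)] = names.pop(key, None)

print(names[0])       # 'T cells' -- integer lookup works again
print(names.inverse)  # bidict({'T cells': 0, 'B cells': 1})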
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)
    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file',
            )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speedup reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
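# For context: this _read sits behind scanpy's public sc.read wrapper. A
# hedged usage sketch; the path is made up and cache_compression assumes a
# scanpy version that exposes it.
import scanpy as sc

# First call parses the file and writes a compressed .h5ad cache; later
# calls with cache=True read the cache instead.
adata = sc.read('data/expression.tsv', cache=True, cache_compression='lzf')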
def read_dataset(path):
    # download_gs_url and get_filename_and_extension are helpers from the
    # surrounding module.
    path = str(path)
    tmp_path = None
    if path.startswith('gs://'):
        tmp_path = download_gs_url(path)
        path = tmp_path
    basename_and_extension = get_filename_and_extension(path)
    ext = basename_and_extension[1]
    if ext == 'mtx':
        x = scipy.io.mmread(path)
        x = scipy.sparse.csr_matrix(x.T)
        # look for .barcodes.txt and .genes.txt sidecar files next to the
        # matrix, trying '.', '_' and '-' separators and .tsv/.txt suffixes
        import itertools
        sp = os.path.split(path)
        obs = None
        for sep_ext in itertools.product(['.', '_', '-'], ['tsv', 'txt']):
            for prefix in ['', basename_and_extension[0] + sep_ext[0]]:
                f = os.path.join(sp[0], prefix + 'barcodes.' + sep_ext[1])
                if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                    obs = pd.read_table(f if os.path.isfile(f) else f + '.gz',
                                        index_col=0, sep='\t', header=None)
                    break
        var = None
        for sep_ext in itertools.product(['.', '_', '-'], ['tsv', 'txt']):
            for prefix in ['', basename_and_extension[0] + sep_ext[0]]:
                f = os.path.join(sp[0], prefix + 'genes.' + sep_ext[1])
                if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                    var = pd.read_table(f if os.path.isfile(f) else f + '.gz',
                                        index_col=0, sep='\t', header=None)
                    break
        if var is None:
            print(basename_and_extension[0] + '.genes.txt not found')
            var = pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[1], step=1))
        if obs is None:
            print(basename_and_extension[0] + '.barcodes.txt not found')
            obs = pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[0], step=1))
        cell_count, gene_count = x.shape
        if len(obs) != cell_count:
            raise ValueError(
                'Wrong number of cells: matrix has {} cells, '
                'barcodes file has {}'.format(cell_count, len(obs)))
        if len(var) != gene_count:
            raise ValueError(
                'Wrong number of genes: matrix has {} genes, '
                'genes file has {}'.format(gene_count, len(var)))
        return anndata.AnnData(X=x, obs=obs, var=var)
    elif ext == 'npz':
        obj = np.load(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(X=obj['x'],
                               obs=pd.DataFrame(index=obj['rid']),
                               var=pd.DataFrame(index=obj['cid']))
    elif ext == 'npy':
        x = np.load(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(
            X=x,
            obs=pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[0], step=1)),
            var=pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[1], step=1)))
    elif ext == 'loom':
        # in loom file, convention is rows are genes :(
        # return anndata.read_loom(path, X_name='matrix', sparse=True)
        f = h5py.File(path, 'r')
        x = f['/matrix']
        is_x_sparse = x.attrs.get('sparse')
        if is_x_sparse:
            # read in blocks of 1000 rows to avoid loading the whole
            # dense matrix at once
            chunk_start = 0
            nrows = x.shape[0]
            chunk_step = min(nrows, 1000)
            chunk_stop = chunk_step
            nchunks = int(np.ceil(max(1, nrows / chunk_step)))
            sparse_arrays = []
            for chunk in range(nchunks):
                chunk_stop = min(nrows, chunk_stop)
                subset = scipy.sparse.csr_matrix(x[chunk_start:chunk_stop])
                sparse_arrays.append(subset)
                chunk_start += chunk_step
                chunk_stop += chunk_step
            x = scipy.sparse.vstack(sparse_arrays)
        else:
            x = x[()]
        row_meta = {}
        row_attrs = f['/row_attrs']
        for key in row_attrs:
            values = row_attrs[key][()]
            if values.dtype.kind == 'S':
                values = values.astype(str)
            row_meta[key] = values
        row_meta = pd.DataFrame(data=row_meta)
        if row_meta.get('id') is not None:
            row_meta.set_index('id', inplace=True)
        col_meta = {}
        col_attrs = f['/col_attrs']
        for key in col_attrs:
            values = col_attrs[key][()]
            if values.dtype.kind == 'S':
                values = values.astype(str)
            col_meta[key] = values
        col_meta = pd.DataFrame(data=col_meta)
        if col_meta.get('id') is not None:
            col_meta.set_index('id', inplace=True)
        f.close()
        return anndata.AnnData(X=x, obs=row_meta, var=col_meta)
    elif ext == 'h5ad':
        return anndata.read_h5ad(path)
    elif ext == 'hdf5' or ext == 'h5':
        return anndata.read_hdf(path)
    elif ext == 'gct':
        ds = wot.io.read_gct(path)
        if tmp_path is not None:
            os.remove(tmp_path)
        return ds
    else:  # txt
        with open(path) as fp:
            row_ids = []
            header = fp.readline()
            # sniff the delimiter from the header line
            sep = None
            for s in ['\t', ',', ' ']:
                test_tokens = header.split(s)
                if len(test_tokens) > 1:
                    sep = s
                    column_ids = test_tokens
                    break
            if sep is None:
                sep = '\t'
            column_ids = column_ids[1:]
            column_ids[-1] = column_ids[-1].rstrip()
            i = 0
            np_arrays = []
            for line in fp:
                line = line.rstrip()
                if line != '':
                    tokens = line.split(sep)
                    row_ids.append(tokens[0])
                    np_arrays.append(np.array(tokens[1:], dtype=np.float64))
                    i += 1
        if tmp_path is not None:
            os.remove(tmp_path)
        return anndata.AnnData(X=np.array(np_arrays),
                               obs=pd.DataFrame(index=row_ids),
                               var=pd.DataFrame(index=column_ids))
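# Hedged usage sketch for read_dataset; paths and bucket name are
# illustrative. Dispatch is purely on the file extension, and gs:// paths
# are downloaded to a temporary file first.
adata = read_dataset('data/expression.mtx')     # finds barcodes/genes sidecars
remote = read_dataset('gs://bucket/expr.h5ad')  # bucket name is made up
print(adata.shape)  # (n_cells, n_genes)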