def score_gene_sets(ds, gs, z_score_ds=True, use_dask=False):
    if use_dask:
        import dask.array as np  # drop-in replacement for numpy
    else:
        import numpy as np
    # gene sets has genes on rows, sets on columns
    # ds has cells on rows, genes on columns
    gs_x = gs.x
    ds_x = ds.x
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
    # keep genes that are in at least one gene set and have standard deviation > 0
    gene_indices = (gs_x.sum(axis=1) > 0) & (ds_x.std(axis=0) > 0)
    gs_x = gs_x[gene_indices]
    ds_x = ds_x[:, gene_indices]
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
        std = np.std(ds_x, axis=0)
        mean = np.mean(ds_x, axis=0)
        ds_x = (ds_x - mean) / std
        # clip z-scores to [-5, 5]; zero out NaNs from zero-variance genes
        ds_x[ds_x < -5] = -5
        ds_x[ds_x > 5] = 5
        ds_x[np.isnan(ds_x)] = 0
    scores = ds_x.dot(gs_x)
    ngenes_in_set = gs_x.sum(axis=0)
    ngenes_in_set[ngenes_in_set == 0] = 1  # avoid divide by zero
    scores = scores / ngenes_in_set
    # scores contains cells on rows, gene sets on columns
    return wot.Dataset(x=scores, row_meta=ds.row_meta, col_meta=gs.col_meta)
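
# A minimal usage sketch for score_gene_sets. The toy matrices below are
# hypothetical; only wot.Dataset and wot.score_gene_sets come from this package.
#
#     import numpy as np
#     import pandas as pd
#     import wot
#
#     expr = wot.Dataset(x=np.array([[1.0, 2.0], [3.0, 4.0]]),
#                        row_meta=pd.DataFrame(index=['cell1', 'cell2']),
#                        col_meta=pd.DataFrame(index=['g1', 'g2']))
#     sets = wot.Dataset(x=np.array([[1], [1]], dtype=np.int8),  # genes x sets
#                        row_meta=pd.DataFrame(index=['g1', 'g2']),
#                        col_meta=pd.DataFrame(index=['my_set']))
#     scores = wot.score_gene_sets(ds=expr, gs=sets, z_score_ds=False)
#     # scores.x is cells x gene sets: here the per-cell mean of g1 and g2,
#     # i.e. [[1.5], [3.5]]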
def read_gct(file_path):
    """Parse a GCT file.

    Args:
        file_path (string): full path to the gct file to parse

    Returns:
        wot.Dataset
    """
    # Read version and dimensions
    (version, num_data_rows, num_data_cols,
     num_row_metadata, num_col_metadata) = read_version_and_dims(file_path)
    # Read in metadata and data
    (row_metadata, col_metadata, data) = parse_into_3(
        file_path, num_data_rows, num_data_cols,
        num_row_metadata, num_col_metadata)
    row_metadata.index.name = None
    col_metadata.index.name = None
    row_metadata.columns.name = None
    col_metadata.columns.name = None
    return wot.Dataset(data, row_metadata, col_metadata)
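
# Usage sketch (the path is hypothetical):
#
#     ds = read_gct('/path/to/expression.gct')
#     # data matrix with the GCT row and column metadata attached
#     print(ds.x.shape, list(ds.row_meta.columns), list(ds.col_meta.columns))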
def compute_dataset_name_to_trends(trajectory_results, unaligned_datasets, dataset_names,
                                   value_transform=None):
    """
    Args:
        trajectory_results (list): Results from trajectory_for_cell_sets_at_time_t
        unaligned_datasets (list): List of datasets whose rows are not necessarily
            aligned with the transport maps; each is aligned to the trajectory
            cell ids before the mean and variance are computed
        dataset_names (list): List of dataset names
        value_transform (function): A function that takes a numpy array and
            returns a numpy array of the same shape, applied to the dataset
            values before averaging

    Returns:
        Dict that maps dataset name to trends. A trend is a dict of mean,
        variance, times, ncells, cell_set, and features. Mean and variance
        have time on rows and features on columns.
    """
    cell_set_name_to_trajectories = wot.ot.Trajectory.group_trajectories_by_cell_set(
        trajectory_results)
    dataset_name_to_trends = {}
    for ds_index in range(len(unaligned_datasets)):
        unaligned_ds = unaligned_datasets[ds_index]
        trends = []
        for cell_set_name in cell_set_name_to_trajectories:
            means = []
            variances = []
            times = []
            ncells = []
            trajectories = cell_set_name_to_trajectories[cell_set_name]
            # trajectories are sorted by time
            for trajectory in trajectories:
                p = trajectory['p']
                cell_ids = trajectory['cell_ids']
                times.append(trajectory['t'])
                ncells.append(len(cell_ids))
                # align dataset rows with cell_ids
                ds_order = unaligned_ds.row_meta.index.get_indexer_for(cell_ids)
                ds_order = ds_order[ds_order != -1]
                aligned_dataset = wot.Dataset(unaligned_ds.x[ds_order],
                                              unaligned_ds.row_meta.iloc[ds_order],
                                              unaligned_ds.col_meta)
                mean_and_variance = TrajectoryTrends.__weighted_average(
                    weights=p, ds=aligned_dataset, value_transform=value_transform)
                means.append(mean_and_variance['mean'])
                variances.append(mean_and_variance['variance'])
            mean = np.array(means)
            variance = np.array(variances)
            trends.append({'mean': mean,
                           'variance': variance,
                           'times': times,
                           'ncells': ncells,
                           'cell_set': cell_set_name,
                           'features': unaligned_ds.col_meta.index.values})
        dataset_name_to_trends[dataset_names[ds_index]] = trends
    return dataset_name_to_trends
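
# Shape sketch of the expected inputs, inferred from the code above: each
# trajectory dict must carry 'p', 'cell_ids', and 't'. All variable names and
# the value_transform choice below are hypothetical.
#
#     trajectory_results = ...  # output of trajectory_for_cell_sets_at_time_t
#     trends_by_name = compute_dataset_name_to_trends(
#         trajectory_results,
#         unaligned_datasets=[expression_ds],
#         dataset_names=['expression'],
#         value_transform=np.expm1)  # e.g. undo a log1p transform before averaging
#     trend = trends_by_name['expression'][0]
#     # trend['mean'].shape == (len(trend['times']), len(trend['features']))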
def filter_ds_from_command_line(ds, args):
    params = vars(args)
    if params.get('gene_filter') is not None:
        prior = ds.x.shape[1]
        gene_ids = pd.read_table(args.gene_filter, index_col=0,
                                 header=None).index.values
        column_indices = ds.col_meta.index.isin(gene_ids)
        nkeep = np.sum(column_indices)
        if params.get('verbose') and len(gene_ids) > nkeep:
            print(str(len(gene_ids) - nkeep) +
                  ' genes are in the gene filter, but not in the matrix')
        ds = wot.Dataset(ds.x[:, column_indices], ds.row_meta,
                         ds.col_meta.iloc[column_indices])
        if params.get('verbose'):
            print('Keeping ' + str(ds.x.shape[1]) + '/' + str(prior) + ' genes')
    if params.get('cell_filter') is not None:
        prior = ds.x.shape[0]
        if not os.path.isfile(args.cell_filter):
            # not a file: treat the cell filter as a regular expression on cell ids
            import re
            expr = re.compile(args.cell_filter)
            cell_ids = [elem for elem in ds.row_meta.index.values
                        if expr.match(elem)]
        else:
            cell_ids = pd.read_table(args.cell_filter, index_col=0,
                                     header=None).index.values
        row_indices = ds.row_meta.index.isin(cell_ids)
        nkeep = np.sum(row_indices)
        if params.get('verbose') and len(cell_ids) > nkeep:
            print(str(len(cell_ids) - nkeep) +
                  ' cells are in the cell filter, but not in the matrix')
        ds = wot.Dataset(ds.x[row_indices], ds.row_meta.iloc[row_indices],
                         ds.col_meta)
        if params.get('verbose'):
            print('Keeping ' + str(ds.x.shape[0]) + '/' + str(prior) + ' cells')
    return ds
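
# Usage sketch with argparse. The flag names mirror the params.get() keys used
# above; the filter file is hypothetical and ds is any wot.Dataset.
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--gene_filter')
#     parser.add_argument('--cell_filter')
#     parser.add_argument('--verbose', action='store_true')
#     args = parser.parse_args(['--gene_filter', 'genes.txt', '--verbose'])
#     ds = filter_ds_from_command_line(ds, args)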
def read_gmt(path, feature_ids=None):
    with open(path) as fp:
        # map lower-cased feature ids to row indices for case-insensitive matching
        row_id_lc_to_index = {}
        row_id_lc_to_row_id = {}
        if feature_ids is not None:
            for i in range(len(feature_ids)):
                fid = feature_ids[i].lower()
                row_id_lc_to_index[fid] = i
                row_id_lc_to_row_id[fid] = feature_ids[i]
        members_array = []
        set_descriptions = []
        set_names = []
        for line in fp:
            if line == '' or line[0] == '#':
                continue
            tokens = line.split('\t')
            if len(tokens) < 3:
                continue
            set_names.append(tokens[0].strip())
            description = tokens[1].strip()
            if 'BLANK' == description:
                description = ''
            set_descriptions.append(description)
            ids = tokens[2:]
            ids_in_set = []
            members_array.append(ids_in_set)
            for i in range(len(ids)):
                value = ids[i].strip()
                if value != '':
                    value_lc = value.lower()
                    row_index = row_id_lc_to_index.get(value_lc)
                    if feature_ids is None:
                        if row_index is None:
                            row_id_lc_to_row_id[value_lc] = value
                            row_index = len(row_id_lc_to_index)
                            row_id_lc_to_index[value_lc] = row_index
                    if row_index is not None:
                        ids_in_set.append(value)
        if feature_ids is None:
            feature_ids = np.empty(len(row_id_lc_to_index), dtype='object')
            for rid_lc in row_id_lc_to_index:
                feature_ids[row_id_lc_to_index[rid_lc]] = row_id_lc_to_row_id[rid_lc]
        x = np.zeros(shape=(len(feature_ids), len(set_names)), dtype=np.int8)
        for j in range(len(members_array)):
            ids = members_array[j]
            for id in ids:
                row_index = row_id_lc_to_index.get(id.lower())
                x[row_index, j] = 1
        row_meta = pd.DataFrame(index=feature_ids)
        col_meta = pd.DataFrame(data={'description': set_descriptions},
                                index=set_names)
        return wot.Dataset(x=x, row_meta=row_meta, col_meta=col_meta)
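
# A GMT file is tab-delimited with one gene set per line: set name, description
# (the literal string BLANK means empty), then the member ids. Usage sketch
# (the file contents and path are hypothetical):
#
#     # my_sets.gmt:
#     # SET1<TAB>BLANK<TAB>g1<TAB>g2
#     # SET2<TAB>stem cell genes<TAB>g2<TAB>g3
#     gs = read_gmt('my_sets.gmt', feature_ids=['g1', 'g2', 'g3'])
#     # gs.x is genes x sets with 0/1 membership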
def read_transport_maps(input_dir, ids=None, time=None):
    transport_maps_inputs = []  # file, start, end
    is_pattern = not os.path.isdir(input_dir)
    files = os.listdir(input_dir) if not is_pattern else glob.glob(input_dir)
    for path in files:
        path = os.path.join(os.path.dirname(input_dir),
                            path) if not is_pattern else path
        if os.path.isfile(path):
            # expect file names ending in _{t1}_{t2}, e.g. tmaps_8.0_8.5
            file_info = wot.io.get_filename_and_extension(os.path.basename(path))
            basename = file_info[0]
            tokens = basename.split('_')
            if len(tokens) < 2:  # skip files without a _{t1}_{t2} suffix
                continue
            t1 = tokens[-2]
            t2 = tokens[-1]
            try:
                t1 = float(t1)
                t2 = float(t2)
            except ValueError:
                continue
            ds = wot.io.read_dataset(path)
            if ids is not None and t1 == time:
                # subset rows
                indices = ds.row_meta.index.isin(ids)
                ds = wot.Dataset(ds.x[indices], ds.row_meta.iloc[indices],
                                 ds.col_meta)
            if ids is not None and t2 == time:
                # subset columns
                indices = ds.col_meta.index.isin(ids)
                ds = wot.Dataset(ds.x[:, indices], ds.row_meta,
                                 ds.col_meta.iloc[indices])
            transport_maps_inputs.append(
                {'transport_map': ds, 't1': t1, 't2': t2})
    transport_maps_inputs.sort(key=lambda x: x['t1'])  # sort by t1 (start time)
    return transport_maps_inputs
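
# Usage sketch: transport map files are matched by the _{t1}_{t2} suffix parsed
# above, e.g. tmaps_8.0_8.5.loom. Directory and file names are hypothetical.
#
#     tmaps = read_transport_maps('tmaps/')
#     first = tmaps[0]  # sorted by start time
#     print(first['t1'], first['t2'], first['transport_map'].x.shape)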
def test_score_gene_sets(self):
    ds = wot.Dataset(x=np.array([[1, 2, 3, 0],
                                 [4, 5, 6, 0]]),
                     row_meta=pd.DataFrame(index=['c1', 'c2']),
                     col_meta=pd.DataFrame(index=['g1', 'g2', 'g3', 'g4']))
    gs = wot.Dataset(x=np.array([[1, 0, 1],
                                 [0, 0, 1],
                                 [0, 0, 0],
                                 [0, 1, 0]], dtype=np.uint8),
                     row_meta=pd.DataFrame(index=['g1', 'g2', 'g3', 'g4']),
                     col_meta=pd.DataFrame(index=['s1', 's2', 's3']))
    result = wot.score_gene_sets(ds=ds, gs=gs, z_score_ds=False)
    np.testing.assert_array_equal(result.x,
                                  np.array([[1, 0, 1.5],
                                            [4, 0, 4.5]]))
def read_gmx(path, feature_ids=None):
    with open(path) as fp:
        # first line holds set names, second line their descriptions
        set_ids = fp.readline().split('\t')
        descriptions = fp.readline().split('\t')
        nsets = len(set_ids)
        for i in range(len(set_ids)):
            set_ids[i] = set_ids[i].rstrip()
            descriptions[i] = descriptions[i].rstrip()
        row_id_lc_to_index = {}
        row_id_lc_to_row_id = {}
        x = None
        array_of_arrays = None
        if feature_ids is not None:
            for i in range(len(feature_ids)):
                fid = feature_ids[i].lower()
                row_id_lc_to_index[fid] = i
                row_id_lc_to_row_id[fid] = feature_ids[i]
            x = np.zeros(shape=(len(feature_ids), nsets), dtype=np.int8)
        else:
            array_of_arrays = []
        for line in fp:
            tokens = line.split('\t')
            for j in range(nsets):
                value = tokens[j].strip()
                if value != '':
                    value_lc = value.lower()
                    row_index = row_id_lc_to_index.get(value_lc)
                    if feature_ids is None:
                        if row_index is None:
                            row_id_lc_to_row_id[value_lc] = value
                            row_index = len(row_id_lc_to_index)
                            row_id_lc_to_index[value_lc] = row_index
                            array_of_arrays.append(
                                np.zeros(shape=(nsets,), dtype=np.int8))
                        array_of_arrays[row_index][j] = 1
                    elif row_index is not None:
                        x[row_index, j] = 1
        if feature_ids is None:
            feature_ids = np.empty(len(row_id_lc_to_index), dtype='object')
            for rid_lc in row_id_lc_to_index:
                feature_ids[row_id_lc_to_index[rid_lc]] = row_id_lc_to_row_id[rid_lc]
        if array_of_arrays is not None:
            x = np.array(array_of_arrays)
        row_meta = pd.DataFrame(index=feature_ids)
        col_meta = pd.DataFrame(data={'description': descriptions},
                                index=set_ids)
        return wot.Dataset(x, row_meta=row_meta, col_meta=col_meta)
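
# A GMX file is the column-oriented counterpart of GMT: the first line holds
# set names, the second descriptions, and each later line one member per set.
# Usage sketch (the file contents and path are hypothetical):
#
#     # my_sets.gmx:
#     # SET1<TAB>SET2
#     # desc1<TAB>desc2
#     # g1<TAB>g2
#     # g2<TAB>g3
#     gs = read_gmx('my_sets.gmx')
#     # gs.x is genes x sets with 0/1 membership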
def merge_datasets(*args):
    datasets = list(args)
    # validate before concatenating so we fail fast on incompatible inputs
    row_columns = set(datasets[0].row_meta.columns)
    if not all([set(d.row_meta.columns) == row_columns for d in datasets]):
        raise ValueError(
            "Unable to merge: incompatible metadata between datasets")
    col_index = datasets[0].col_meta.index
    if not all([d.col_meta.index.equals(col_index) for d in datasets]):
        raise ValueError(
            "Unable to merge: incompatible genes between datasets")
    merged_row_meta = pandas.concat([d.row_meta for d in datasets], sort=True)
    if merged_row_meta.index.duplicated().any():
        raise ValueError(
            "Unable to merge: duplicate rows between datasets, cannot lose information")
    merged_col_meta = datasets[0].col_meta
    merged_x = numpy.concatenate([d.x for d in datasets])
    return wot.Dataset(merged_x, merged_row_meta, merged_col_meta)
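
# Usage sketch: row-wise concatenation of datasets that share the same genes
# (the two toy datasets are hypothetical):
#
#     merged = merge_datasets(ds_day1, ds_day2)
#     # merged.x stacks cells from both datasets; a ValueError is raised on
#     # duplicate cell ids or mismatched gene indices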
def read_grp(path, feature_ids=None):
    with open(path) as fp:
        # map lower-cased feature ids to row indices for case-insensitive matching
        row_id_lc_to_index = {}
        row_id_lc_to_row_id = {}
        if feature_ids is not None:
            for i in range(len(feature_ids)):
                fid = feature_ids[i].lower()
                row_id_lc_to_index[fid] = i
                row_id_lc_to_row_id[fid] = feature_ids[i]
        ids_in_set = set()
        for line in fp:
            if line == '' or line[0] == '#':
                continue
            value = line.strip()
            if value != '':
                value_lc = value.lower()
                row_index = row_id_lc_to_index.get(value_lc)
                if feature_ids is None:
                    if row_index is None:
                        row_id_lc_to_row_id[value_lc] = value
                        row_index = len(row_id_lc_to_index)
                        row_id_lc_to_index[value_lc] = row_index
                if row_index is not None:
                    ids_in_set.add(value)
        if feature_ids is None:
            feature_ids = np.empty(len(row_id_lc_to_index), dtype='object')
            for rid_lc in row_id_lc_to_index:
                feature_ids[row_id_lc_to_index[rid_lc]] = row_id_lc_to_row_id[rid_lc]
        x = np.zeros(shape=(len(feature_ids), 1), dtype=np.int8)
        for id in ids_in_set:
            row_index = row_id_lc_to_index.get(id.lower())
            x[row_index, 0] = 1
        row_meta = pd.DataFrame(index=feature_ids)
        # the single set is named after the file
        col_meta = pd.DataFrame(
            index=[wot.io.get_filename_and_extension(os.path.basename(path))[0]])
        return wot.Dataset(x=x, row_meta=row_meta, col_meta=col_meta)
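
# A GRP file lists one member id per line; lines starting with # are skipped.
# Usage sketch (the file is hypothetical):
#
#     gs = read_grp('stem_cells.grp')
#     # gs.x.shape == (n_features, 1), a single set named after the file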
def dataset_from_x(x, rows=None, columns=None, row_prefix="cell_",
                   column_prefix="gene_"):
    if rows is None:
        # zero-pad generated ids so they sort lexicographically
        row_count_len = math.floor(math.log10(x.shape[0])) + 1
        rows = ["{}{:0{}}".format(row_prefix, i, row_count_len)
                for i in range(x.shape[0])]
    if columns is None:
        col_count_len = math.floor(math.log10(x.shape[1])) + 1
        columns = ["{}{:0{}}".format(column_prefix, i, col_count_len)
                   for i in range(x.shape[1])]
    return wot.Dataset(x,
                       pandas.DataFrame([], index=rows, columns=[]),
                       pandas.DataFrame([], index=columns, columns=[]))
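
# Usage sketch: wrap a bare matrix with auto-generated, zero-padded ids
# (the random matrix is hypothetical):
#
#     import numpy
#     ds = dataset_from_x(numpy.random.rand(100, 5))
#     list(ds.row_meta.index[:2])  # ['cell_000', 'cell_001']
#     list(ds.col_meta.index[:2])  # ['gene_0', 'gene_1']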
def read_dataset(path, chunks=(500, 500), use_dask=False, genome10x=None,
                 row_filter=None, col_filter=None, force_sparse=False,
                 backed=False):
    path = str(path)
    basename_and_extension = get_filename_and_extension(path)
    ext = basename_and_extension[1]
    if ext == 'mtx':
        # look for companion .barcodes.tsv/.txt and .genes.tsv/.txt files
        sp = os.path.split(path)
        row_meta = None
        for f in (os.path.join(sp[0], basename_and_extension[0] + '.barcodes.tsv'),
                  os.path.join(sp[0], basename_and_extension[0] + '.barcodes.txt'),
                  os.path.join(sp[0], 'barcodes.tsv')):
            if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                row_meta = pd.read_table(f if os.path.isfile(f) else f + '.gz',
                                         index_col=0, sep='\t', header=None)
                break
        col_meta = None
        for f in (os.path.join(sp[0], basename_and_extension[0] + '.genes.tsv'),
                  os.path.join(sp[0], basename_and_extension[0] + '.genes.txt'),
                  os.path.join(sp[0], 'genes.tsv')):
            if os.path.isfile(f) or os.path.isfile(f + '.gz'):
                col_meta = pd.read_table(f if os.path.isfile(f) else f + '.gz',
                                         index_col=0, sep='\t', header=None)
                break
        x = scipy.io.mmread(path)
        x = scipy.sparse.csr_matrix(x.T)  # matrix market stores genes x cells
        if col_meta is None:
            print(basename_and_extension[0] + '.genes.txt not found')
            col_meta = pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[1], step=1))
        if row_meta is None:
            print(basename_and_extension[0] + '.barcodes.txt not found')
            row_meta = pd.DataFrame(
                index=pd.RangeIndex(start=0, stop=x.shape[0], step=1))
        return wot.Dataset(x=x, row_meta=row_meta, col_meta=col_meta)
    elif ext == 'hdf5' or ext == 'h5' or ext == 'loom' or ext == 'h5ad':
        f = h5py.File(path, 'r')
        if ext == 'h5ad':
            h5_x = '/X'
            h5_row_meta = '/obs'
            h5_col_meta = '/var'
        elif ext == 'loom':
            h5_x = '/matrix'
            h5_row_meta = '/row_attrs'
            h5_col_meta = '/col_attrs'
        else:
            # 10x Genomics hdf5: one group per genome
            if genome10x is None:
                keys = list(f.keys())
                if len(keys) > 0:
                    genome10x = keys[0]
            group = f['/' + genome10x]
            M, N = group['shape'][()]
            data = group['data'][()]
            x = scipy.sparse.csr_matrix(
                (data, group['indices'][()], group['indptr'][()]), shape=(N, M))
            col_meta = pd.DataFrame(index=group['gene_names'][()].astype(str),
                                    data={'ensembl': group['genes'][()].astype(str)})
            row_meta = pd.DataFrame(index=group['barcodes'][()].astype(str))
            f.close()
            return wot.Dataset(x=x, row_meta=row_meta, col_meta=col_meta)
        if ext == 'h5ad':
            row_meta = pd.DataFrame.from_records(f[h5_row_meta][()], index='index')
            col_meta = pd.DataFrame.from_records(f[h5_col_meta][()], index='index')
            row_meta.index = row_meta.index.values.astype(str)
            col_meta.index = col_meta.index.values.astype(str)
            nrows = row_meta.shape[0]  # needed below for chunked sparse reads
        else:
            row_attrs = read_h5_attrs(f, h5_row_meta, row_filter)
            nrows = (len(row_attrs['indices'])
                     if row_attrs['indices'] is not None else f[h5_x].shape[0])
            row_meta = pd.DataFrame(row_attrs['attrs'],
                                    index=pd.RangeIndex(start=0, stop=nrows, step=1))
            if row_meta.get('id') is not None:
                row_meta.set_index('id', inplace=True)
            col_attrs = read_h5_attrs(f, h5_col_meta, col_filter)
            ncols = (len(col_attrs['indices'])
                     if col_attrs['indices'] is not None else f[h5_x].shape[1])
            col_meta = pd.DataFrame(col_attrs['attrs'],
                                    index=pd.RangeIndex(start=0, stop=ncols, step=1))
            if col_meta.get('id') is not None:
                col_meta.set_index('id', inplace=True)
        if not use_dask:
            x = f[h5_x]
            is_x_sparse = False
            if type(x) == h5py.Group:
                # h5sparse-style group with data/indices/indptr
                data = x['data'][()]
                x = scipy.sparse.csr_matrix(
                    (data, x['indices'][()], x['indptr'][()]),
                    shape=x.attrs['h5sparse_shape'])
                backed = False
            else:
                is_x_sparse = x.attrs.get('sparse')
            if not backed and (is_x_sparse or force_sparse) and (
                    row_filter is None and col_filter is None):
                # read in blocks of 1000 rows to avoid densifying the whole matrix
                chunk_start = 0
                chunk_step = min(nrows, 1000)
                chunk_stop = chunk_step
                nchunks = int(np.ceil(max(1, nrows / chunk_step)))
                sparse_arrays = []
                for chunk in range(nchunks):
                    chunk_stop = min(nrows, chunk_stop)
                    subset = scipy.sparse.csr_matrix(x[chunk_start:chunk_stop])
                    sparse_arrays.append(subset)
                    chunk_start += chunk_step
                    chunk_stop += chunk_step
                x = scipy.sparse.vstack(sparse_arrays)
            else:
                if row_filter is None and col_filter is None and not backed:
                    x = x[()]
                elif row_filter is not None and col_filter is not None:
                    x = x[row_attrs['indices']]
                    x = x[:, col_attrs['indices']]
                elif row_filter is not None:
                    x = x[row_attrs['indices']]
                elif col_filter is not None:
                    x = x[:, col_attrs['indices']]
                if not backed and (is_x_sparse or force_sparse):
                    x = scipy.sparse.csr_matrix(x)
            if not backed:
                f.close()
            return wot.Dataset(x=x, row_meta=row_meta, col_meta=col_meta)
        else:
            import dask.array as da
            x = da.from_array(f[h5_x], chunks=chunks)
            # TODO load metadata in chunks
            # row_meta = dd.from_pandas(row_meta, npartitions=4, sort=False)
            # col_meta = dd.from_pandas(col_meta, npartitions=4, sort=False)
            return wot.Dataset(x=x, row_meta=row_meta, col_meta=col_meta)
    elif ext == 'gct':
        return wot.io.read_gct(path)
    else:
        # delimited text: first column holds row ids, first row holds column ids
        with open(path) as fp:
            row_ids = []
            header = fp.readline()
            for s in ['\t', ' ', ',']:
                test = header.split(s)
                if len(test) > 1:
                    sep = s
                    column_ids = test
                    break
            column_ids = column_ids[1:]
            column_ids[len(column_ids) - 1] = column_ids[len(column_ids) - 1].rstrip()
            np_arrays = []
            for line in fp:
                line = line.rstrip()
                if line != '':
                    tokens = line.split(sep)
                    row_ids.append(tokens[0])
                    np_arrays.append(np.array(tokens[1:], dtype=np.float64))
            return wot.Dataset(x=np.array(np_arrays),
                               row_meta=pd.DataFrame(index=row_ids),
                               col_meta=pd.DataFrame(index=column_ids))
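
# Usage sketch: the reader dispatches on the file extension (mtx, h5/hdf5 for
# 10x Genomics, loom, h5ad, gct, or delimited text). Paths and the genome key
# are hypothetical.
#
#     ds = read_dataset('expression.loom')
#     ds_sparse = read_dataset('expression.loom', force_sparse=True)
#     ds_10x = read_dataset('filtered_gene_bc_matrices.h5', genome10x='GRCh38')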
def extract_cells_at_indices(ds, indices):
    return wot.Dataset(ds.x[indices], ds.row_meta.iloc[indices].copy(),
                       ds.col_meta.copy())
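
# Usage sketch: positional subsetting of cells (the indices are hypothetical):
#
#     subset = extract_cells_at_indices(ds, [0, 2, 5])
#     # subset.x has 3 rows; column metadata is copied unchanged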