def export_marker(marker: Mapping[str, Iterable[Tuple[str, str]]],
                  fname: str,
                  defaultpath: Optional[bool] = True):
    """Export marker pairs to json-File.

    Parameters
    ----------
    marker
        Marker pairs as from :func:`~pypairs.pairs.sandbag`
    fname
        Name of the json-File in the writedir (see settings)
    defaultpath
        Use settings.writedir as root. Default: True

    Notes
    -----
    An ``IOError`` raised while writing is logged as an error and is not
    re-raised, so the function returns ``None`` in either case.
    """
    # NOTE(review): plain string concatenation - this assumes
    # ``settings.writedir`` already ends with a path separator; confirm.
    fpath = settings.writedir + fname if defaultpath else fname

    try:
        write_dict_to_json(marker, fpath)
        logg.hint("marker pairs written to: " + str(fpath))
    except IOError as e:
        # The two sentences used to be glued together without a separating
        # space ("...<path>.Please verify...").
        msg = "could not write to {}. ".format(fpath) + \
              "Please verify that the path exists and is writable. Or change `writedir` via `pypairs.settings.writedir`"
        logg.error(msg)
        logg.error(str(e))
def filter_unexpressed_genes(data, gene_names):
    """Drop every column of ``data`` that is zero in all rows.

    Parameters
    ----------
    data
        Two-dimensional array; the column axis is filtered.
    gene_names
        One name per column of ``data``.

    Returns
    -------
    The column-filtered array and the list of names of the kept columns.
    """
    # A column is kept unless all of its entries equal zero.
    keep = np.logical_not(np.all(data == 0, axis=0))

    expressed_data = data[:, keep]
    kept_names = np.array(gene_names)[keep]

    n_dropped = len(keep) - int(np.count_nonzero(keep))
    if n_dropped != 0:
        logg.hint("filtered out {} unexpressed genes".format(n_dropped))

    return expressed_data, list(kept_names)
def filter_marker_pairs(marker_pairs, gene_names):
    """Translate marker pairs from gene names to compact gene indices.

    For every category, pairs in which at least one gene is missing from
    ``gene_names`` are dropped. The remaining genes are flagged in a boolean
    mask over ``gene_names`` and each pair is rewritten as indices into the
    masked (compacted) gene axis.

    Parameters
    ----------
    marker_pairs
        Mapping from category to an iterable of ``(gene_1, gene_2)`` pairs.
    gene_names
        Sequence of gene names defining the index space.

    Returns
    -------
    marker_pairs_idx
        Mapping from category to ``np.array`` of ``[idx_1, idx_2]`` rows,
        where the indices refer to positions among the genes flagged in the
        category's mask.
    used_masks
        Mapping from category to a boolean mask over ``gene_names`` marking
        the genes that occur in at least one retained pair.
    """
    gene_name_to_idx = {name: pos for pos, name in enumerate(gene_names)}
    n_genes = len(gene_names)

    removed = 0
    marker_pairs_idx = {}
    used_masks = {}

    for cat, pairs in marker_pairs.items():
        # Resolve each pair exactly once. Previously the pairs were walked
        # (and looked up) twice, which also silently produced an empty
        # result for one-shot iterables.
        resolved = []
        for pair in pairs:
            try:
                resolved.append(
                    (gene_name_to_idx[pair[0]], gene_name_to_idx[pair[1]]))
            except KeyError:
                # Not logged per pair on purpose: that produces too much
                # output; only the total is reported below.
                removed += 1

        used_mask = np.zeros(n_genes, dtype='bool')
        for g1_idx, g2_idx in resolved:
            used_mask[g1_idx] = True
            used_mask[g2_idx] = True
        used_masks[cat] = used_mask

        # Map original gene index -> position within the masked gene axis.
        compact_idx = {
            orig: pos
            for pos, orig in enumerate(np.where(used_mask)[0].tolist())
        }
        marker_pairs_idx[cat] = np.array(
            [[compact_idx[g1_idx], compact_idx[g2_idx]]
             for g1_idx, g2_idx in resolved])

    logg.hint("translated marker pairs, {} removed".format(removed))
    return marker_pairs_idx, used_masks
def get_filter_masks(data, gene_names, sample_names, categories, filter_genes,
                     filter_samples):
    """Build boolean keep-masks for genes (columns) and samples (rows).

    A gene is kept if it is non-zero in at least one row of ``data`` and is
    selected by ``filter_genes``. A sample is kept if it belongs to at least
    one row of ``categories`` and is selected by ``filter_samples``.

    Parameters
    ----------
    data
        Two-dimensional array, samples x genes.
    gene_names
        One name per column of ``data``.
    sample_names
        One name per row of ``data``.
    categories
        Boolean array with one row per category and one column per sample.
        With zero category rows no sample is kept.
    filter_genes
        Gene selection, converted via ``to_boolean_mask``.
    filter_samples
        Sample selection, converted via ``to_boolean_mask``.

    Returns
    -------
    gene_mask, sample_mask
        Boolean masks. ``data`` itself is not modified.
    """
    n_samples, n_genes = data.shape[0], data.shape[1]

    expressed_genes = np.invert(np.all(data == 0, axis=0))
    selected_genes = to_boolean_mask(filter_genes, gene_names)
    gene_mask = np.logical_and(expressed_genes, selected_genes)

    selected_samples = to_boolean_mask(filter_samples, sample_names)
    # Union over all category rows: True where a sample has any category.
    categorized = np.any(categories, axis=0)
    sample_mask = np.logical_and(categorized, selected_samples)

    n_samples_kept = int(np.sum(sample_mask))
    n_genes_kept = int(np.sum(gene_mask))

    # The report used to be guarded by a flag that was never set, so it could
    # not run; it also printed the unfiltered shape as the "new" shape.
    if n_samples_kept != n_samples or n_genes_kept != n_genes:
        logg.hint(
            "filtered out {} samples and {} genes based on passed subsets".
            format(n_samples - n_samples_kept, n_genes - n_genes_kept))
        logg.hint("new data is of shape {} x {}".format(
            n_samples_kept, n_genes_kept))

    return gene_mask, sample_mask
def parse_data(
        data: Union[AnnData, DataFrame, np.ndarray],
        gene_names: Optional[Iterable[str]] = None,
        sample_names: Optional[Iterable[str]] = None
) -> Tuple[np.ndarray, list, list]:
    """Reduce :class:`~anndata.AnnData` and :class:`~pandas.DataFrame` to a
    :class:`~numpy.ndarray` and extract `gene_names` and `sample_names`.

    Explicitly passed names always take precedence. Otherwise they are read
    from ``var_names`` / ``obs_names`` (AnnData) or ``columns`` / ``index``
    (DataFrame). For a plain ndarray both name lists must be passed.

    Raises
    ------
    ValueError
        If ``data`` is an ndarray and names are missing, or if ``data`` is of
        an unsupported type.
    """
    if isinstance(data, AnnData):
        sample_names = list(
            data.obs_names) if sample_names is None else sample_names
        gene_names = list(
            data.var_names) if gene_names is None else gene_names
        raw_data = data.X
    elif isinstance(data, DataFrame):
        gene_names = list(data.columns) if gene_names is None else gene_names
        sample_names = list(
            data.index) if sample_names is None else sample_names
        raw_data = data.values
    elif isinstance(data, np.ndarray):
        # A bare array carries no labels, so both must come from the caller.
        if gene_names is None or sample_names is None:
            raise ValueError(
                "Provide gene names and sample names in ``gene_names`` and ``sample_names``"
            )
        raw_data = data
    else:
        raise ValueError(
            "data can only be of type AnnData, DataFrame or ndarray")

    n_rows_cols = raw_data.shape
    logg.hint("passed data of shape {} x {} (samples x genes)".format(
        *n_rows_cols))

    return raw_data, gene_names, sample_names
def load_marker(fname: str, defaultpath: Optional[bool] = True):
    """Load marker pairs from json-File.

    Parameters
    ----------
    fname
        Name of the json-File to read from
    defaultpath
        Use settings.writedir as root. Default: True

    Returns
    -------
    The marker pairs read from the file, or ``None`` if the file could not
    be read (the failure is logged, not raised).
    """
    # NOTE(review): plain string concatenation - this assumes
    # ``settings.writedir`` already ends with a path separator; confirm.
    fpath = settings.writedir + fname if defaultpath else fname

    try:
        marker = read_dict_from_json(fpath)
    except IOError:
        # This is a read failure; the message used to ask whether the path
        # is "writable".
        logg.error(
            "could not read from {}.\n Please verify that the path exists and is readable."
            .format(fpath))
        return None

    # Per-category summary only at high verbosity.
    if settings.verbosity > 2:
        counts = [(category, len(pairs)) for category, pairs in marker.items()]
        logg.hint("loaded {} marker pairs".format(
            sum(count for _, count in counts)))
        for category, count in counts:
            logg.hint("\t{}: {}".format(category, count))

    return marker
def parallel_njit(func: Callable[[], Any],
                  jitted: Optional[bool] = True) -> Callable[[], Any]:
    """Dynamic decorator for jit-compiled functions.

    Returns ``func`` unchanged when compilation is switched off (``jitted``
    or ``settings.enable_jit`` is ``False``). Otherwise returns ``func``
    wrapped by ``njit``; ``parallel=True`` is added only if
    ``settings.n_jobs > 1`` and ``is_win32()`` is ``False``.
    """
    compile_disabled = jitted is False or settings.enable_jit is False
    if compile_disabled:
        logg.warn(
            'staring uncompiled processing. Should only be used for debug and testing!'
        )
        return func

    multi_job = settings.n_jobs > 1

    # Parallel compilation is only attempted off win32.
    if multi_job and is_win32() is False:
        logg.hint('staring parallel processing with {} threads'.format(
            settings.n_jobs))
        return njit(func, parallel=True, fastmath=settings.enable_fastmath)

    # Both remaining cases compile single-threaded; they differ only in the
    # message that is logged.
    if multi_job:
        logg.error(
            'n_jobs is set to {} but multiprocessing is not supported for your platform! '
            'falling back to single core... '.format(settings.n_jobs))
    else:
        logg.hint('staring processing with 1 thread')

    return njit(func, fastmath=settings.enable_fastmath)
def parse_data_and_annotation(
        data: Union[AnnData, DataFrame, np.ndarray, Iterable[Iterable[float]]],
        annotation: Optional[Mapping[str, Iterable[Union[str, int, bool]]]] = None,
        gene_names: Optional[Iterable[str]] = None,
        sample_names: Optional[Iterable[str]] = None
) -> Tuple[np.ndarray, list, list, np.ndarray, np.ndarray]:
    """Parse ``data`` and derive one boolean sample mask per category.

    The raw matrix and the gene / sample names come from :func:`parse_data`.
    Categories are taken from ``annotation`` when it is non-empty; otherwise,
    for :class:`~anndata.AnnData` input, from ``data.obs['category']``.

    Returns
    -------
    raw_data, gene_names, sample_names, category_names, categories
        ``categories`` is a boolean array with one row per entry of
        ``category_names`` and one column per sample.

    Raises
    ------
    ValueError
        If no categories are available: ``annotation`` is empty and ``data``
        is either not AnnData or has no ``'category'`` key in ``obs``.
    """
    raw_data, gene_names, sample_names = parse_data(data, gene_names,
                                                    sample_names)

    if annotation:
        # An explicit annotation wins, whatever the type of ``data``.
        category_names, categories = parse_annotation(annotation, sample_names)
    elif not isinstance(data, AnnData):
        raise ValueError("Provide categories in ``annotation``")
    elif 'category' not in data.obs_keys():
        # The lookup is on ``obs``; the message used to point at ``var``.
        raise ValueError(
            "Provide categories as data.obs['category'] or in ``annotation``"
        )
    else:
        obs_category = data.obs['category']
        category_names = np.unique(obs_category)
        categories = np.zeros((len(category_names), len(sample_names)),
                              dtype=bool)

        logg.hint("passed {} categories: {}".format(
            len(category_names), str(category_names)))

        for i, name in enumerate(category_names):
            categories[i] = np.isin(obs_category, name)
            # Rows are boolean by construction, so the member count is the
            # number of True entries (no need to inspect the first element,
            # which failed for zero samples).
            logg.hint("\t{}: {}".format(name,
                                        int(np.count_nonzero(categories[i]))))

    return raw_data, gene_names, sample_names, category_names, categories
def parse_annotation(
        annotation: Mapping[str, Iterable[Union[str, int, bool]]],
        sample_names: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]:
    """Translate an annotation ``{'category': [sample1, sample2, ...], ..}``
    into an array of boolean masks, one row per category.

    Each value is converted with ``to_boolean_mask`` against
    ``sample_names``.

    Returns
    -------
    category_names
        Array of the annotation keys.
    categories
        Boolean array of shape ``(n_categories, n_samples)``.
    """
    category_names = np.array(list(annotation.keys()))
    n_categories = len(category_names)
    categories = np.empty((n_categories, len(sample_names)), dtype=bool)

    logg.hint("passed {} categories: {}".format(n_categories,
                                                str(category_names)))

    for row, key in enumerate(annotation.keys()):
        members = annotation[key]

        # For a boolean mask the member count is the number of True values;
        # for names or indices it is simply the length of the list.
        if isinstance(members[0], (bool, np.bool_)):
            n_members = sum(members)
        else:
            n_members = len(members)
        logg.hint("\t{}: {}".format(key, n_members))

        categories[row] = to_boolean_mask(np.array(members), sample_names)

    return category_names, categories
def sandbag(
        data: Union[AnnData, DataFrame, np.ndarray, Collection[Collection[float]]],
        annotation: Optional[Mapping[str, Collection[Union[str, int, bool]]]] = None,
        gene_names: Optional[Collection[str]] = None,
        sample_names: Optional[Collection[str]] = None,
        fraction: float = 0.65,
        filter_genes: Optional[Collection[Union[str, int, bool]]] = None,
        filter_samples: Optional[Collection[Union[str, int, bool]]] = None
) -> Mapping[str, Collection[Tuple[str, str]]]:
    """
    Calculate 'marker pairs' from a genecount matrix. Cells x Genes.

    A Pair of genes `(g1, g2)` is considered a marker for a category if its
    expression changes from `g1 > g2` in one category to `g1 < g2` in all
    other categories, for at least a ``fraction`` of cells in this category.

    ``data`` can be of type :class:`~anndata.AnnData`,
    :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should contain
    the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows
    correspond to cells and columns to genes.

    * If data is an :class:`~anndata.AnnData` object, the category for each
      sample should be in ``data.obs['category']`` (or passed via
      ``annotation``), gene names in ``data.var_names`` and sample names in
      ``data.obs_names``.

    * If data is a :class:`~pandas.DataFrame` object, gene names can be in
      ``df.columns`` or passed via ``gene_names`` and sample names in
      ``df.index`` or passed via ``sample_names``. The category for each
      sample must be passed via ``annotation``.

        * ``annotation`` must be in form of
          `{'category1': ['sample_1','sample_2',...], ...}`. List of samples
          for indexing can be integer, str or a boolean mask of
          ``len(sample_names)``.

    * If data is a :class:`~numpy.ndarray`, all information must be passed
      via ``annotation``, ``gene_names`` and ``sample_names`` parameters.

    Marker pairs are returned as a mapping from category to list of 2-tuple
    Genes: `{'category': [(Gene_1,Gene_2), ...], ...}`

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    annotation
        Mapping from category to samples. Required unless ``data`` is
        :class:`~anndata.AnnData` carrying ``obs['category']``.
        List of samples can be index, names or logical mask.
    gene_names
        Names for genes, must be same length as ``n_vars``.
        Required if ``data`` is a :class:`~numpy.ndarray`.
    sample_names
        Names for samples, must be same length as ``n_obs``.
        Required if ``data`` is a :class:`~numpy.ndarray`.
    fraction
        Fraction of cells per category where marker criteria must be
        satisfied. Default: 0.65
    filter_genes
        A list of genes to keep. If not ``None`` all genes not in this list
        will be removed. List can be index, names or logical mask.
    filter_samples
        A list of samples to keep. If not ``None`` all samples not in this
        list will be removed. List can be index, names or logical mask.

    Returns
    -------
    marker_pairs_dict
        A dict mapping from str to a list of 2-tuple, where the key is the
        category and the list contains the marker pairs:
        `{'Category_1': [(Gene_1, Gene_2), ...], ...}`.

    Examples
    --------
    To generate marker pairs for a different fraction than the default (0.65)
    based on the bundled ``oscope``-dataset [Leng15]_ run::

        from pypairs import pairs, datasets

        adata = datasets.leng15()
        marker_pairs = pairs.sandbag(adata, fraction=0.5)

    """
    logg.info('identifying marker pairs with sandbag', r=True)
    logg.hint('sandbag running with fraction of {}'.format(fraction))

    # Normalise every accepted input type to ndarray + names + category masks.
    parsed = utils.parse_data_and_annotation(data, annotation, gene_names,
                                             sample_names)
    data, gene_names, sample_names, category_names, categories = parsed

    # Keep-masks from the user's selection; unexpressed genes are dropped too.
    gene_mask, sample_mask = utils.get_filter_masks(
        data, gene_names, sample_names, categories, filter_genes,
        filter_samples)

    # Only gene names and category masks need filtering here; the matrix is
    # sliced right before it is handed to the pair check.
    gene_names = np.array(gene_names)[gene_mask]
    categories = categories[:, sample_mask]

    categories, category_names = remove_empty_categories(
        categories, category_names)

    # Per category: number of cells * fraction.
    thresholds = calc_thresholds(categories, fraction)

    # Boolean category masks -> category index per sample.
    cats = np.where(categories.T == True)[1]

    data = data.astype(float)

    # Compile check_pairs according to settings and platform.
    check_pairs_decorated = utils.parallel_njit(check_pairs)
    filtered_counts = data[sample_mask][:, gene_mask]
    pairs = check_pairs_decorated(filtered_counts, cats, thresholds)

    # Every entry other than -1 holds the category index of a marker pair;
    # its position gives the two gene indices.
    first_genes, second_genes = np.where(pairs != -1)[:2]
    marker_pairs_dict = {}
    for g1, g2 in zip(first_genes, second_genes):
        category = category_names[pairs[g1, g2]]
        marker_pairs_dict.setdefault(category, []).append(
            (gene_names[g1], gene_names[g2]))

    logg.info('finished', time=True)

    # Per-category summary only at high verbosity.
    if settings.verbosity > 2:
        per_category = [(name, len(found))
                        for name, found in marker_pairs_dict.items()]
        logg.hint("found {} marker pairs".format(
            sum(count for _, count in per_category)))
        for name, count in per_category:
            logg.hint("\t{}: {}".format(name, count))

    return marker_pairs_dict
def cyclone(data: Union[AnnData, DataFrame, np.ndarray, Collection[Collection[float]]],
            marker_pairs: Optional[Mapping[str, Collection[Tuple[str, str]]]] = None,
            gene_names: Optional[Collection[str]] = None,
            sample_names: Optional[Collection[str]] = None,
            iterations: Optional[int] = 1000,
            min_iter: Optional[int] = 100,
            min_pairs: Optional[int] = 50) -> DataFrame:
    """Score samples for each category based on marker pairs.

    ``data`` can be of type :class:`~anndata.AnnData`,
    :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should contain
    the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows
    correspond to cells and columns to genes.

        * If a :class:`~anndata.AnnData` object is passed, every column of
          the returned frame is also added to ``data.obs`` under the key
          ``pypairs_{column}``, i.e. ``pypairs_{category}`` for the scores
          and ``pypairs_max_class`` for the final prediction.

        * If marker pairs contain only the cell cycle categories G1, S and
          G2M an additional column ``cc_prediction`` (``data.obs`` key
          ``pypairs_cc_prediction``) will be added. Where category S is
          assigned to samples where G1 and G2M score are below 0.5, as
          described in [Scialdone15]_.

    ``marker_pairs``, i.e. output from :func:`~pypairs.tools.sandbag()`, must
    be a mapping from category to list of 2-tuple Genes:
    `{'category': [(Gene_1,Gene_2), ...], ...}`.

        * If no ``marker_pairs`` are passed the default are used from
          :func:`~pypairs.datasets.default_cc_marker()` based on [Leng15]_
          (marker pairs for cell cycle prediction).

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    marker_pairs
        A dict mapping from str to a list of 2-tuple, where the key is the
        category and the list contains the marker pairs:
        {'Category_1': [(Gene_1, Gene_2), ...], ...}.
        If not provided default marker pairs are used
    gene_names
        Names for genes, must be same length as ``n_vars``.
    sample_names
        Names for samples, must be same length as ``n_obs``.
    iterations
        An integer specifying the number of iterations for random sampling to
        obtain a cycle score. Default: 1000
    min_iter
        An integer specifying the minimum number of iterations for score
        estimation. Default: 100
    min_pairs
        An integer specifying the minimum number of pairs for cycle
        estimation. Default: 50

    Returns
    -------
    A :class:`~pandas.DataFrame` with samples as index and categories as
    columns with scores for each category for each sample and an additional
    column ``max_class`` with the name of the max scoring category for each
    sample.

        * If marker pairs contain only the cell cycle categories G1, S and
          G2M an additional column ``cc_prediction`` will be added. Where
          category S is assigned to samples where G1 and G2M score are below
          0.5, as described in [Scialdone15]_.

    Examples
    --------
    To predict the cell cycle phase of the unsorted cell from the [Leng15]_
    dataset run::

        from pypairs import pairs, datasets

        adata = datasets.leng15('unsorted')
        marker_pairs = datasets.default_cc_marker()
        scores = pairs.cyclone(adata, marker_pairs)
        print(scores)

    """
    logg.info('predicting category scores with cyclone', r=True)

    if marker_pairs is None:
        logg.hint(
            'no marker pairs passed, using default cell cycle prediction marker'
        )
        marker_pairs = datasets.default_cc_marker()

    raw_data, gene_names, sample_names = utils.parse_data(
        data, gene_names, sample_names)

    # Filter marker pairs to those where both genes are present in `data`
    marker_pairs, used = filter_marker_pairs(marker_pairs, gene_names)

    logg.hint('staring processing with {} thread'.format(settings.n_jobs))

    raw_data = raw_data.astype(float)

    get_phase_scores_decorated = utils.parallel_njit(get_phase_scores)

    scores = {
        cat: get_phase_scores_decorated(raw_data, iterations, min_iter,
                                        min_pairs, pairs, used[cat])
        for cat, pairs in marker_pairs.items()
    }

    scores_df = DataFrame(scores, columns=list(marker_pairs))
    scores_df.index = sample_names
    # Computed before any non-score column is added, so only scores compete.
    scores_df['max_class'] = scores_df.idxmax(axis=1)

    # Exactly the three cell cycle phases and nothing else.
    is_cell_cycle = len(marker_pairs) == 3 and all(
        phase in marker_pairs for phase in ("G1", "S", "G2M"))

    if is_cell_cycle:
        g1_g2m = scores_df.loc[:, ["G1", "G2M"]]
        # Pair each sample's best G1/G2M score with its phase positionally.
        # Indexing the idxmax Series with an integer would be a label lookup
        # for integer sample names and is deprecated for other labels.
        scores_df['cc_prediction'] = [
            "S" if best_score < 0.5 else best_phase
            for best_score, best_phase in zip(
                g1_g2m.max(axis=1).values,
                g1_g2m.idxmax(axis=1).values)
        ]

    if isinstance(data, AnnData):
        logg.hint('adding scores with key "pypairs_{category}" to `data.obs`"')
        logg.hint(
            'adding max_class with key "pypairs_max_class" to `data.obs`"')
        if is_cell_cycle:
            logg.hint(
                'adding cc_prediction with key "pypairs_cc_prediction" to `data.obs`"'
            )

        # ``DataFrame.iteritems`` was removed in pandas 2.0; ``items`` is the
        # equivalent column iterator.
        for name, values in scores_df.items():
            data.obs['pypairs_{}'.format(name)] = values

    logg.info('finished', time=True)

    return scores_df