def benchmark_test(func, params, runs=3, repeats=3):
    """Time ``func(**params)`` with :mod:`timeit` and log a short summary.

    While the timing runs, ``settings.verbosity`` is set to 0. The summary
    line is logged with ``settings.verbosity`` set to 4, as before. Afterwards
    the verbosity that was active when this function was called is restored,
    also when ``func`` raises.

    Parameters
    ----------
    func
        Callable to benchmark.
    params
        Mapping of keyword arguments; ``func`` is called as ``func(**params)``.
    runs
        Number of timing measurements (``repeat`` argument of
        :meth:`timeit.Timer.repeat`).
    repeats
        Number of calls to ``func`` per measurement (``number`` argument of
        :meth:`timeit.Timer.repeat`).
    """
    logg.info("Benchmarking ({} runs, {} repeats):".format(runs, repeats))

    # Remember the caller's setting so the benchmark does not clobber it.
    previous_verbosity = settings.verbosity
    settings.verbosity = 0
    try:
        times = timeit.Timer(partial(func, **params)).repeat(runs, repeats)
        # Each measurement covers `repeats` calls; normalise to time per call.
        time_taken = np.array(times) / repeats

        settings.verbosity = 4
        logg.info("\tTotal {total}, mean: {mean}, min {min}".format(
            total=nice_seconds(sum(time_taken)),
            mean=nice_seconds(time_taken.mean()),
            min=nice_seconds(min(time_taken))))
    finally:
        settings.verbosity = previous_verbosity
def is_cached(fname):
    """Report whether ``fname`` exists as a file inside ``settings.cachedir``.

    If the cache directory does not exist yet, an attempt is made to create
    it (including missing parent directories). When that fails, a warning is
    logged and ``settings.cachedir`` is set to ``None``.

    Parameters
    ----------
    fname
        File name, relative to ``settings.cachedir``.

    Returns
    -------
    bool
        ``True`` only if the cache directory exists and contains a file named
        ``fname``. ``False`` if ``settings.cachedir`` is ``None``, if the file
        is missing, or if the cache directory had to be created (or could not
        be created) during this call.
    """
    if settings.cachedir is None:
        return False

    if os.path.isdir(settings.cachedir):
        return os.path.isfile(os.path.join(settings.cachedir, fname))

    try:
        # makedirs handles nested cache paths; exist_ok tolerates the
        # directory appearing between the isdir() check and this call.
        os.makedirs(settings.cachedir, exist_ok=True)
        dir_abs = Path(settings.cachedir).absolute()
        logg.info("created specified cache dir: {}".format(dir_abs))
    except OSError:
        logg.warn(
            "could not create specified cache directory: {}.\n No caching will be used for this session."
            " You can change `cachedir` via `pypairs.settings.cachedir`".
            format(settings.cachedir))
        settings.cachedir = None

    # A directory that was just created (or is unavailable) holds no cached file.
    return False
def sandbag(
        data: Union[AnnData, DataFrame, np.ndarray, Collection[Collection[float]]],
        annotation: Optional[Mapping[str, Collection[Union[str, int, bool]]]] = None,
        gene_names: Optional[Collection[str]] = None,
        sample_names: Optional[Collection[str]] = None,
        fraction: float = 0.65,
        filter_genes: Optional[Collection[Union[str, int, bool]]] = None,
        filter_samples: Optional[Collection[Union[str, int, bool]]] = None
) -> Mapping[str, Collection[Tuple[str, str]]]:
    """Calculate 'marker pairs' from a gene count matrix (cells x genes).

    A pair of genes `(g1, g2)` is considered a marker for a category if its
    expression changes from `g1 > g2` in one category to `g1 < g2` in all
    other categories, for at least a ``fraction`` of cells in this category.

    ``data`` can be of type :class:`~anndata.AnnData`,
    :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should contain
    the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows
    correspond to cells and columns to genes.

    * If data is an :class:`~anndata.AnnData` object, the category for each
      sample is expected in the per-sample annotation
      (``data.obs['category']``, as populated by the bundled datasets), gene
      names in ``data.var_names`` and sample names in ``data.obs_names``.
    * If data is a :class:`~pandas.DataFrame` object, gene names can be in
      ``df.columns`` or passed via ``gene_names`` and sample names in
      ``df.index`` or passed via ``sample_names``. The category for each
      sample must be passed via ``annotation``.

        * ``annotation`` must be in form of
          `{'category1': ['sample_1','sample_2',...], ...}`. The list of
          samples can be integer indices, names or a boolean mask of
          ``len(sample_names)``.

    * If data is a :class:`~numpy.ndarray`, all information must be passed
      via the ``annotation``, ``gene_names`` and ``sample_names`` parameters.

    Marker pairs are returned as a mapping from category to a list of
    2-tuples of genes: `{'category': [(Gene_1,Gene_2), ...], ...}`

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    annotation
        Mapping from category to the samples belonging to it. If ``data`` is
        not :class:`~anndata.AnnData`, this is required. Samples can be given
        as index, names or logical mask.
    gene_names
        Names for genes, must be same length as ``n_vars``. If ``data`` is
        not :class:`~anndata.AnnData`, this is required.
    sample_names
        Names for samples, must be same length as ``n_obs``. If ``data`` is
        not :class:`~anndata.AnnData`, this is required.
    fraction
        Fraction of cells per category where marker criteria must be
        satisfied. Default: 0.65
    filter_genes
        A list of genes to keep. If not ``None`` all genes not in this list
        will be removed. List can be index, names or logical mask.
    filter_samples
        A list of samples to keep. If not ``None`` all samples not in this
        list will be removed. List can be index, names or logical mask.

    Returns
    -------
    marker_pairs_dict
        A dict mapping from str to a list of 2-tuple, where the key is the
        category and the list contains the marker pairs:
        `{'Category_1': [(Gene_1, Gene_2), ...], ...}`.

    Examples
    --------
    To generate marker pairs for a different fraction than the default (0.65)
    based on the bundled ``oscope``-dataset [Leng15]_ run::

        from pypairs import pairs, datasets

        adata = datasets.leng15()
        marker_pairs = pairs.sandbag(adata, fraction=0.5)
    """
    logg.info('identifying marker pairs with sandbag', r=True)
    logg.hint('sandbag running with fraction of {}'.format(fraction))

    # Normalise every supported input type to an ndarray plus its metadata.
    parsed = utils.parse_data_and_annotation(
        data, annotation, gene_names, sample_names)
    data, gene_names, sample_names, category_names, categories = parsed

    # Masks combine the user-supplied filters with the helper's own filtering.
    masks = utils.get_filter_masks(
        data, gene_names, sample_names, categories, filter_genes, filter_samples)
    gene_mask, sample_mask = masks

    # Only gene names and the category matrix need the masks applied here;
    # sample names are not used any further.
    kept_genes = np.array(gene_names)[gene_mask]
    categories, category_names = remove_empty_categories(
        categories[:, sample_mask], category_names)

    # Per-category threshold derived from the category sizes and `fraction`.
    thresholds = calc_thresholds(categories, fraction)

    # For every sample, the index of the category it is flagged for.
    category_index = np.where(categories.T == True)[1]  # noqa: E712

    data = data.astype(float)

    # Wrap check_pairs according to settings and platform before calling it.
    find_pairs = utils.parallel_njit(check_pairs)
    pair_matrix = find_pairs(
        data[sample_mask][:, gene_mask], category_index, thresholds)

    # Entries different from -1 hold the category index of a marker pair.
    first_genes, second_genes = np.where(pair_matrix != -1)

    marker_pairs_dict = defaultdict(list)
    for g1, g2 in zip(first_genes, second_genes):
        category = category_names[pair_matrix[g1, g2]]
        marker_pairs_dict[category].append((kept_genes[g1], kept_genes[g2]))

    logg.info('finished', time=True)

    # Report the number of marker pairs found, in total and per category.
    if settings.verbosity > 2:
        counts = {name: len(found) for name, found in marker_pairs_dict.items()}
        logg.hint("found {} marker pairs".format(sum(counts.values())))
        for name, count in counts.items():
            logg.hint("\t{}: {}".format(name, count))

    return dict(marker_pairs_dict)
def cyclone(data: Union[AnnData, DataFrame, np.ndarray, Collection[Collection[float]]],
            marker_pairs: Optional[Mapping[str, Collection[Tuple[str, str]]]] = None,
            gene_names: Optional[Collection[str]] = None,
            sample_names: Optional[Collection[str]] = None,
            iterations: Optional[int] = 1000,
            min_iter: Optional[int] = 100,
            min_pairs: Optional[int] = 50) -> DataFrame:
    """Score samples for each category based on marker pairs.

    ``data`` can be of type :class:`~anndata.AnnData`,
    :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should contain
    the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows
    correspond to cells and columns to genes.

    * If a :class:`~anndata.AnnData` object is passed, every column of the
      returned frame is also written to ``data.obs`` under the key
      ``pypairs_{column}``, i.e. ``pypairs_{category}`` for the scores and
      ``pypairs_max_class`` for the best scoring category.

        * If the marker pairs contain exactly the cell cycle categories G1,
          S and G2M, ``pypairs_cc_prediction`` is added as well. Category S
          is assigned to samples where both the G1 and the G2M score are
          below 0.5, as described in [Scialdone15]_.

    ``marker_pairs``, i.e. output from :func:`~pypairs.tools.sandbag()`,
    must be a mapping from category to list of 2-tuple genes:
    `{'category': [(Gene_1,Gene_2), ...], ...}`.

    * If no ``marker_pairs`` are passed, the defaults from
      ``datasets.default_cc_marker()`` are used (marker pairs for cell cycle
      prediction, based on [Leng15]_).

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    marker_pairs
        A dict mapping from str to a list of 2-tuple, where the key is the
        category and the list contains the marker pairs:
        {'Category_1': [(Gene_1, Gene_2), ...], ...}. If not provided,
        default marker pairs are used.
    gene_names
        Names for genes, must be same length as ``n_vars``.
    sample_names
        Names for samples, must be same length as ``n_obs``.
    iterations
        An integer specifying the number of iterations for random sampling
        to obtain a cycle score. Default: 1000
    min_iter
        An integer specifying the minimum number of iterations for score
        estimation. Default: 100
    min_pairs
        An integer specifying the minimum number of pairs for cycle
        estimation. Default: 50

    Returns
    -------
    A :class:`~pandas.DataFrame` with samples as index and one score column
    per category, plus a column ``max_class`` with the name of the max
    scoring category for each sample.

        * If the marker pairs contain exactly the cell cycle categories G1,
          S and G2M, an additional column ``cc_prediction`` is added.
          Category S is assigned to samples where both the G1 and the G2M
          score are below 0.5, as described in [Scialdone15]_.

    Examples
    --------
    To predict the cell cycle phase of the unsorted cells from the
    [Leng15]_ dataset run::

        from pypairs import pairs, datasets

        adata = datasets.leng15('unsorted')
        marker_pairs = datasets.default_cc_marker()
        scores = pairs.cyclone(adata, marker_pairs)
        print(scores)
    """
    logg.info('predicting category scores with cyclone', r=True)

    if marker_pairs is None:
        logg.hint(
            'no marker pairs passed, using default cell cycle prediction marker'
        )
        marker_pairs = datasets.default_cc_marker()

    raw_data, gene_names, sample_names = utils.parse_data(
        data, gene_names, sample_names)

    # Filter marker pairs to those where both genes are present in `data`
    marker_pairs, used = filter_marker_pairs(marker_pairs, gene_names)

    logg.hint('staring processing with {} thread'.format(settings.n_jobs))

    raw_data = raw_data.astype(float)

    # Wrap get_phase_scores according to settings and platform.
    get_phase_scores_decorated = utils.parallel_njit(get_phase_scores)

    scores = {
        cat: get_phase_scores_decorated(raw_data, iterations, min_iter,
                                        min_pairs, cat_pairs, used[cat])
        for cat, cat_pairs in marker_pairs.items()
    }

    scores_df = DataFrame(scores, columns=marker_pairs.keys())
    scores_df.index = sample_names
    scores_df['max_class'] = scores_df.idxmax(axis=1)

    # The cell cycle prediction only applies to exactly {G1, S, G2M}.
    is_cell_cycle = len(marker_pairs) == 3 and all(
        phase in marker_pairs for phase in ("G1", "S", "G2M"))

    if is_cell_cycle:
        g1_g2m = scores_df.loc[:, ["G1", "G2M"]]
        # Work on positional arrays: indexing the Series with an integer
        # would be label based (and wrong) for integer sample names.
        best_phase = g1_g2m.idxmax(axis=1).values
        best_score = g1_g2m.max(axis=1).values
        scores_df['cc_prediction'] = [
            "S" if score < 0.5 else phase
            for phase, score in zip(best_phase, best_score)
        ]

    if isinstance(data, AnnData):
        logg.hint('adding scores with key "pypairs_{category}" to `data.obs`"')
        logg.hint(
            'adding max_class with key "pypairs_max_class" to `data.obs`"')
        if is_cell_cycle:
            logg.hint(
                'adding cc_prediction with key "pypairs_cc_prediction" to `data.obs`"'
            )

        # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
        for name, values in scores_df.items():
            key_name = 'pypairs_{}'.format(name)
            data.obs[key_name] = values

    logg.info('finished', time=True)

    return scores_df
def _has_selection(sub):
    """Return ``True`` when ``sub`` is a non-empty index selection."""
    if sub is None:
        return False
    try:
        # Length based so that numpy arrays work; `bool(array)` raises for
        # more than one element and is False for `array([0])`.
        return len(sub) > 0
    except TypeError:
        # Objects without a length (e.g. slices) keep the previous
        # truthiness-based behaviour.
        return bool(sub)


def leng15(
        mode: Optional[str] = 'all',
        gene_sub: Optional[Iterable[int]] = None,
        sample_sub: Optional[Iterable[int]] = None
) -> anndata.AnnData:
    """Single cell RNA-seq data of human hESCs to evaluate Oscope [Leng15]_

    Total 213 H1 single cells and 247 H1-Fucci single cells were sequenced.
    The 213 H1 cells were used to evaluate Oscope in identifying oscillatory
    genes. The H1-Fucci cells were used to confirm the cell cycle gene
    cluster identified by Oscope in the H1 hESCs.
    Normalized expected counts are provided in
    GSE64016_H1andFUCCI_normalized_EC.csv.gz

    The unzipped table is cached in ``settings.cachedir`` when a cache
    directory is configured; with ``settings.cachedir`` set to ``None`` the
    bundled archive is read on every call.

    Reference
    ---------
    GEO-Dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE64016

    Parameters
    ----------
    mode
        sample selection mode:
            - 'all' (or ``None``) for all samples, default
            - 'sorted' for all samples with known cell cycle (G2, S or G1)
            - 'unsorted' for all samples with unknown cell cycle (H1)
    gene_sub
        Index based array of subsetted genes
    sample_sub
        Index based array of subsetted samples

    Returns
    -------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix containing the normalized gene counts, with
        samples as observations and the sample name prefix (``G2`` mapped to
        ``G2M``) stored in ``adata.obs['category']``.

    Raises
    ------
    NotImplementedError
        If ``mode`` is not one of 'all', 'sorted', 'unsorted' or ``None``.
    """
    filename_cached = "GSE64016_H1andFUCCI_normalized_EC_cached.csv"

    if utils.is_cached(filename_cached):
        # Build the path only here: `settings.cachedir` may be None, in
        # which case joining it unconditionally raises a TypeError.
        x = utils.load_pandas(os.path.join(settings.cachedir, filename_cached))
    else:
        filename = os.path.join(os.path.dirname(__file__),
                                'GSE64016_H1andFUCCI_normalized_EC.csv.gz')

        with gzip.open(filename, 'r') as fin:
            x = pd.read_csv(io.TextIOWrapper(fin, newline=""))

        x.set_index("Unnamed: 0", inplace=True)

        if settings.cachedir is not None:
            path_cached = os.path.join(settings.cachedir, filename_cached)
            try:
                utils.save_pandas(path_cached, x)
                abs_path = Path(path_cached).absolute()
                logg.info("cached unzipped leng15 dataset to {}.".format(abs_path))
            except IOError as e:
                # Caching is best effort; the dataset is already loaded.
                logg.warn("could not write to {}.\n Please verify that the path exists and is writable."
                          "Or change `cachedir` via `pypairs.settings.cachedir`".format(settings.cachedir))
                logg.warn(str(e))

    # At this point samples are columns; select them by name prefix.
    if mode == 'sorted':
        x.drop(list(x.filter(regex='H1_')), axis=1, inplace=True)
    elif mode == 'unsorted':
        x.drop(list(x.filter(regex='^(?!H1_).*')), axis=1, inplace=True)
    elif mode == 'all' or mode is None:
        pass
    else:
        raise NotImplementedError("valid options for mode are 'all', 'sorted' or 'unsorted'")

    if _has_selection(gene_sub):
        x = x.iloc[gene_sub, :]

    if _has_selection(sample_sub):
        x = x.iloc[:, sample_sub]

    # AnnData expects observations (samples) as rows.
    x = x.transpose()

    prefixes = [sample.split("_")[0] for sample in x.index]
    adata = anndata.AnnData(
        x.values,
        obs={
            "category": ["G2M" if prefix == "G2" else prefix for prefix in prefixes]
        }
    )

    adata.var_names = list(x.columns)
    adata.obs_names = list(x.index)

    return adata