Exemple #1
0
def benchmark_test(func, params, runs=3, repeats=3):
    """Time ``func(**params)`` with :mod:`timeit` and log a short summary.

    Logging is silenced (``settings.verbosity = 0``) while ``func`` is being
    timed. The summary line is emitted with verbosity 4, after which the
    verbosity that was active on entry is restored, also when ``func`` raises.

    Parameters
    ----------
    func
        Callable to benchmark.
    params
        Mapping of keyword arguments passed to ``func`` on every call.
    runs
        Number of timing runs (``repeat`` argument of ``Timer.repeat``).
    repeats
        Number of calls of ``func`` per run (``number`` argument of
        ``Timer.repeat``).
    """
    logg.info("Benchmarking ({} runs, {} repeats):".format(runs, repeats))

    # Remember the caller's setting so it is not clobbered by the benchmark.
    previous_verbosity = settings.verbosity
    try:
        settings.verbosity = 0

        times = timeit.Timer(partial(func, **params)).repeat(runs, repeats)
        # Each entry of `times` covers `repeats` calls; normalise to per call.
        time_taken = np.array(times) / repeats

        # Make sure the result line is actually printed.
        settings.verbosity = 4
        logg.info("\tTotal {total}, mean: {mean}, min {min}".format(
            total=nice_seconds(time_taken.sum()),
            mean=nice_seconds(time_taken.mean()),
            min=nice_seconds(time_taken.min())))
    finally:
        settings.verbosity = previous_verbosity
Exemple #2
0
def is_cached(fname):
    """Return whether ``fname`` exists as a file inside ``settings.cachedir``.

    If ``settings.cachedir`` is ``None`` caching is disabled and ``False`` is
    returned. If the cache directory does not exist yet, an attempt is made to
    create it with :func:`os.mkdir`; a freshly created directory cannot
    contain ``fname``, so ``False`` is returned. If the directory cannot be
    created a warning is logged, ``settings.cachedir`` is set to ``None`` and
    ``False`` is returned.

    Parameters
    ----------
    fname
        File name relative to ``settings.cachedir``.

    Returns
    -------
    bool
        ``True`` only if the cache directory exists and contains ``fname``.
    """
    if settings.cachedir is None:
        return False

    if os.path.isdir(settings.cachedir):
        return os.path.isfile(os.path.join(settings.cachedir, fname))

    try:
        os.mkdir(settings.cachedir)
    except OSError:
        logg.warn(
            "could not create specified cache directory: {}.\n No caching will be used for this session."
            " You can change `cachedir` via `pypairs.settings.cachedir`".
            format(settings.cachedir))
        # Disable caching so later calls do not retry and warn again.
        settings.cachedir = None
    else:
        dir_abs = Path(settings.cachedir).absolute()
        logg.info("created specified cache dir: {}".format(dir_abs))

    # Always an explicit bool: nothing can be cached in a new/unusable dir.
    return False
Exemple #3
0
def sandbag(
    data: Union[AnnData, DataFrame, np.ndarray, Collection[Collection[float]]],
    annotation: Optional[Mapping[str, Collection[Union[str, int,
                                                       bool]]]] = None,
    gene_names: Optional[Collection[str]] = None,
    sample_names: Optional[Collection[str]] = None,
    fraction: float = 0.65,
    filter_genes: Optional[Collection[Union[str, int, bool]]] = None,
    filter_samples: Optional[Collection[Union[str, int, bool]]] = None
) -> Mapping[str, Collection[Tuple[str, str]]]:
    """
    Calculate 'marker pairs' from a genecount matrix. Cells x Genes.

    A Pair of genes `(g1, g2)` is considered a marker for a category if its expression changes from `g1 > g2`
    in one category to `g1 < g2` in all other categories, for at least a ``fraction`` of cells in this category.

    ``data`` can be of type :class:`~anndata.AnnData`, :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should
    contain the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows correspond to cells and columns to
    genes.

        *
            If data is :class:`~anndata.AnnData` object, the category for each sample should be in
            ``data.obs['category']``, gene names in ``data.var_names`` and sample names in ``data.obs_names``.

        *
            If data is :class:`~pandas.DataFrame` object, gene names can be in ``df.columns`` or passed via
            ``gene_names`` and sample names in ``df.index`` or passed via ``sample_names``. The category for each
            sample must be passed via ``annotation``.

            *
                ``annotation`` must be in form of `{'category1': ['sample_1','sample_2',...], ...}`. List of samples
                for indexing can be integer, str or a boolean mask of ``len(sample_names)``.

        *
            If data is :class:`~numpy.ndarray`, all information must be passed via ``annotation``, ``gene_names`` and
            ``sample_names`` parameters.

    Marker pairs are returned as a mapping from category to list of 2-tuple Genes: `{'category': [(Gene_1,Gene_2), ...],
    ...}`

    Parameters
    ----------

    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    annotation
        Mapping from category to samples. If ``data`` is not :class:`~anndata.AnnData`, this is required.
        List of samples can be index, names or logical mask.
    gene_names
        Names for genes, must be same length as ``n_vars``. Required if ``data`` is a :class:`~numpy.ndarray`;
        for a :class:`~pandas.DataFrame` the names may alternatively be taken from ``df.columns``.
    sample_names
        Names for samples, must be same length as ``n_obs``. Required if ``data`` is a :class:`~numpy.ndarray`;
        for a :class:`~pandas.DataFrame` the names may alternatively be taken from ``df.index``.
    fraction
        Fraction of cells per category where marker criteria must be satisfied. Default: 0.65
    filter_genes
        A list of genes to keep. If not ``None`` all genes not in this list will be removed.
        List can be index, names or logical mask.
    filter_samples
         A list of samples to keep. If not ``None`` all samples not in this list will be removed.
         List can be index, names or logical mask.

    Returns
    -------

    marker_pairs_dict
        A dict mapping from str to a list of 2-tuple, where the key is the category and the list contains the marker
        pairs: `{'Category_1': [(Gene_1, Gene_2), ...], ...}`. Categories without any marker pair are not included.

    Examples
    --------
        To generate marker pairs for a different fraction
        than the default (0.65) based on the bundled ``oscope``-dataset [Leng15]_ run::

            from pypairs import pairs, datasets

            adata = datasets.leng15()
            marker_pairs = pairs.sandbag(adata, fraction=0.5)

    """
    logg.info('identifying marker pairs with sandbag', r=True)
    logg.hint('sandbag running with fraction of {}'.format(fraction))

    # AnnData or DataFrame or ndarray -> ndarray + meta information
    data, gene_names, sample_names, category_names, categories = utils.parse_data_and_annotation(
        data, annotation, gene_names, sample_names)

    # Get filter mask based on filter selection, and filter out unexpressed genes
    gene_mask, sample_mask = utils.get_filter_masks(data, gene_names,
                                                    sample_names, categories,
                                                    filter_genes,
                                                    filter_samples)

    # Apply mask to gene names and categories, samples are not needed
    gene_names = np.array(gene_names)[gene_mask]
    categories = categories[:, sample_mask]

    # Remove empty categories
    categories, category_names = remove_empty_categories(
        categories, category_names)

    # Cells in category * fraction
    thresholds = calc_thresholds(categories, fraction)

    # Turn array of boolean into array of index.
    # The explicit comparison is kept on purpose: the dtype of `categories`
    # is not visible here, and `== True` is not the same as truthiness for
    # non-boolean arrays.
    cats = np.where(categories.T == True)[1]

    data = data.astype(float)

    # Decorate check_pairs according to settings and platform
    check_pairs_decorated = utils.parallel_njit(check_pairs)
    pairs = check_pairs_decorated(data[sample_mask][:, gene_mask], cats,
                                  thresholds)

    # Convert to easier to read dict and return.
    # Every entry of `pairs` that is not -1 holds the index of the category
    # the gene pair (row, column) is a marker for.
    first_genes, second_genes = np.where(pairs != -1)

    marker_pairs_dict = defaultdict(list)

    for g1, g2 in zip(first_genes, second_genes):
        cat = pairs[g1, g2]

        marker_pairs_dict[category_names[cat]].append(
            (gene_names[g1], gene_names[g2]))

    logg.info('finished', time=True)

    # Print count of marker pairs per category
    if settings.verbosity > 2:
        count_total = 0
        count_str = []
        for category, category_pairs in marker_pairs_dict.items():
            count = len(category_pairs)
            count_total += count
            count_str.append("\t{}: {}".format(category, count))

        logg.hint("found {} marker pairs".format(count_total))
        for line in count_str:
            logg.hint(line)

    # Hand out a plain dict so lookups of unknown categories raise KeyError
    # instead of silently creating empty entries.
    return dict(marker_pairs_dict)
Exemple #4
0
def cyclone(data: Union[AnnData, DataFrame, np.ndarray,
                        Collection[Collection[float]]],
            marker_pairs: Optional[Mapping[str,
                                           Collection[Tuple[str,
                                                            str]]]] = None,
            gene_names: Optional[Collection[str]] = None,
            sample_names: Optional[Collection[str]] = None,
            iterations: Optional[int] = 1000,
            min_iter: Optional[int] = 100,
            min_pairs: Optional[int] = 50) -> DataFrame:
    """Score samples for each category based on marker pairs.

    ``data`` can be of type :class:`~anndata.AnnData`, :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should
    contain the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows correspond to cells and columns to
    genes.

        *
            If a :class:`~anndata.AnnData` object is passed, every column of the returned score table is additionally
            written to ``data.obs``: the category scores with key ``pypairs_{category}`` and the final prediction
            with key ``pypairs_max_class``.

            *
                If marker pairs contain only the cell cycle categories G1, S and G2M an additional column
                ``pypairs_cc_prediction`` will be added. Where category S is assigned to samples where G1 and G2M score
                are below 0.5, as described in [Scialdone15]_.


    ``marker_pairs``, i.e. output from :func:`~pypairs.tools.sandbag()`, must be a mapping from category to list of
    2-tuple Genes: `{'category': [(Gene_1,Gene_2), ...], ...}`.

        *
            If no ``marker_pairs`` are passed the default are used from :func:`~pypairs.datasets.default_cc_marker()`
            based on [Leng15]_ (marker pairs for cell cycle prediction).

    Parameters
    ----------

    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    marker_pairs
        A dict mapping from str to a list of 2-tuple, where the key is the category and the list contains the marker
        pairs: {'Category_1': [(Gene_1, Gene_2), ...], ...}. If not provided default marker pairs are used
    gene_names
        Names for genes, must be same length as ``n_vars``.
    sample_names
        Names for samples, must be same length as ``n_obs``.
    iterations
        An integer specifying the number of iterations for random sampling to obtain a cycle score. Default: 1000
    min_iter
        An integer specifying the minimum number of iterations for score estimation. Default: 100
    min_pairs
        An integer specifying the minimum number of pairs for cycle estimation. Default: 50

    Returns
    -------

    A :class:`~pandas.DataFrame` with samples as index and categories as columns with scores for each category for each
    sample and an additional column ``max_class`` with the name of the max scoring category for each sample.

        *
            If marker pairs contain only the cell cycle categories G1, S and G2M an additional column
            ``cc_prediction`` will be added. Where category S is assigned to samples where G1 and G2M score are
            below 0.5, as described in [Scialdone15]_.


    Examples
    --------
        To predict the cell cycle phase of the unsorted cell from the [Leng15]_  dataset run::

            from pypairs import pairs, datasets

            adata = datasets.leng15('unsorted')
            marker_pairs = datasets.default_cc_marker()
            scores = pairs.cyclone(adata, marker_pairs)
            print(scores)

    """
    logg.info('predicting category scores with cyclone', r=True)

    if marker_pairs is None:
        logg.hint(
            'no marker pairs passed, using default cell cycle prediction marker'
        )
        marker_pairs = datasets.default_cc_marker()

    raw_data, gene_names, sample_names = utils.parse_data(
        data, gene_names, sample_names)

    # Filter marker pairs to those where both genes are present in `data`
    marker_pairs, used = filter_marker_pairs(marker_pairs, gene_names)

    logg.hint('staring processing with {} thread'.format(settings.n_jobs))

    raw_data = raw_data.astype(float)

    get_phase_scores_decorated = utils.parallel_njit(get_phase_scores)

    scores = {
        cat: get_phase_scores_decorated(raw_data, iterations, min_iter,
                                        min_pairs, cat_pairs, used[cat])
        for cat, cat_pairs in marker_pairs.items()
    }

    scores_df = DataFrame(scores, columns=marker_pairs.keys())
    scores_df.index = sample_names
    scores_df['max_class'] = scores_df.idxmax(axis=1)

    # The dedicated cell cycle prediction only applies when the categories
    # are exactly G1, S and G2M.
    is_cell_cycle = len(marker_pairs) == 3 and all(
        phase in marker_pairs for phase in ("G1", "S", "G2M"))

    if is_cell_cycle:
        g1_g2m_scores = scores_df.loc[:, ["G1", "G2M"]]
        best_phase = g1_g2m_scores.idxmax(axis=1).values
        best_score = g1_g2m_scores.max(axis=1).values
        # Pair values positionally; looking the phase up with an integer key
        # on the sample-labelled Series would be a label lookup (or rely on a
        # deprecated positional fallback) and break for integer sample names.
        scores_df['cc_prediction'] = [
            "S" if score < 0.5 else phase
            for score, phase in zip(best_score, best_phase)
        ]

    if isinstance(data, AnnData):
        logg.hint('adding scores with key "pypairs_{category}" to `data.obs`"')
        logg.hint(
            'adding max_class with key "pypairs_max_class" to `data.obs`"')
        if is_cell_cycle:
            logg.hint(
                'adding cc_prediction with key "pypairs_cc_prediction" to `data.obs`"'
            )

        # `DataFrame.iteritems` was removed in pandas 2.0; `items` is the
        # equivalent that is available in old and new versions.
        for name, values in scores_df.items():
            key_name = 'pypairs_{}'.format(name)
            data.obs[key_name] = values

    logg.info('finished', time=True)
    return scores_df
Exemple #5
0
def leng15(
    mode: Optional[str] = 'all',
    gene_sub: Optional[Iterable[int]] = None,
    sample_sub: Optional[Iterable[int]] = None
) -> anndata.AnnData:
    """Single cell RNA-seq data of human hESCs to evaluate Oscope [Leng15]_

    Total 213 H1 single cells and 247 H1-Fucci single cells were sequenced.
    The 213 H1 cells were used to evaluate Oscope in identifying oscillatory genes.
    The H1-Fucci cells were used to confirm the cell cycle gene cluster identified
    by Oscope in the H1 hESCs.
    Normalized expected counts are provided in GSE64016_H1andFUCCI_normalized_EC.csv.gz

    The bundled gzip file is read from the directory of this module. If
    ``settings.cachedir`` is set, an unzipped copy is written there and used
    on later calls; with ``settings.cachedir = None`` no cache is read or
    written.

    Reference
    ---------
        GEO-Dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE64016

    Parameters
    ----------
    mode
        sample selection mode:
            - 'all' (or ``None``) for all samples, default
            - 'sorted' for all samples with known cell cycle (G2, S or G1)
            - 'unsorted' for all samples with unknown cell cycle (H1)
    gene_sub
        Index based array of subsetted genes. ``None`` or an empty selection keeps all genes.
    sample_sub
        Index based array of subsetted samples. ``None`` or an empty selection keeps all samples.

    Returns
    -------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix containing the normalized gene counts.
        ``adata.obs['category']`` holds the part of each sample name before
        the first underscore, with ``G2`` relabelled as ``G2M``.

    Raises
    ------
    NotImplementedError
        If ``mode`` is not one of the supported options.
    """

    filename_cached = "GSE64016_H1andFUCCI_normalized_EC_cached.csv"

    # `is_cached` returns False when caching is disabled, so the cache path is
    # only built in branches where `settings.cachedir` is known to be set.
    if utils.is_cached(filename_cached):
        x = utils.load_pandas(os.path.join(settings.cachedir, filename_cached))
    else:
        filename = os.path.join(os.path.dirname(__file__), 'GSE64016_H1andFUCCI_normalized_EC.csv.gz')

        with gzip.open(filename, 'r') as fin:
            x = pd.read_csv(io.TextIOWrapper(fin, newline=""))

        x.set_index("Unnamed: 0", inplace=True)

        if settings.cachedir is not None:
            path_cached = os.path.join(settings.cachedir, filename_cached)
            try:
                utils.save_pandas(path_cached, x)
                abs_path = Path(path_cached).absolute()
                logg.info("cached unzipped leng15 dataset to {}.".format(abs_path))
            except IOError as e:
                # Caching is best effort: the dataset is still returned.
                logg.warn("could not write to {}.\n Please verify that the path exists and is writable."
                          "Or change `cachedir` via `pypairs.settings.cachedir`".format(settings.cachedir))
                logg.warn(str(e))

    # At this point rows are genes and columns are samples.
    if mode == 'sorted':
        x.drop(list(x.filter(regex='H1_')), axis=1, inplace=True)
    elif mode == 'unsorted':
        x.drop(list(x.filter(regex='^(?!H1_).*')), axis=1, inplace=True)
    elif mode == 'all' or mode is None:
        pass
    else:
        raise NotImplementedError("valid options for mode are 'all', 'sorted' or 'unsorted'")

    # Materialise the selections before testing for emptiness: truth-testing a
    # numpy array with more than one element raises ValueError.
    if gene_sub is not None:
        gene_sub = list(gene_sub)
        if gene_sub:
            x = x.iloc[gene_sub, :]

    if sample_sub is not None:
        sample_sub = list(sample_sub)
        if sample_sub:
            x = x.iloc[:, sample_sub]

    # AnnData expects samples as rows and genes as columns.
    x = x.transpose()

    sample_prefixes = [s.split("_")[0] for s in x.index]

    adata = anndata.AnnData(
        x.values,
        obs={
            "category":
                ["G2M" if prefix == "G2" else prefix for prefix in sample_prefixes]
        }
    )

    adata.var_names = list(x.columns)
    adata.obs_names = list(x.index)

    return adata