Example #1
def export_marker(marker: Mapping[str, Iterable[Tuple[str, str]]],
                  fname: str,
                  defaultpath: Optional[bool] = True):
    """Export marker pairs to json-File.

    Parameters
    ----------
    marker
        Marker pairs as from :func:`~pypairs.pairs.sandbag`
    fname
        Name of the json-File in the writedir (see settings)
    defaultpath
        Use settings.writedir as root. Default: True
    """

    if defaultpath:
        fpath = settings.writedir + fname
    else:
        fpath = fname

    try:
        write_dict_to_json(marker, fpath)
        logg.hint("marker pairs written to: " + str(fpath))
    except IOError as e:
        msg = "could not write to {}.".format(fpath) + \
              "Please verify that the path exists and is writable. Or change `writedir` via `pypairs.settings.writedir`"
        logg.error(msg)
        logg.error(str(e))
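
A minimal standalone sketch of the JSON round trip this helper wraps, using the plain json module instead of pypairs' write_dict_to_json/read_dict_from_json; the file name and marker dict below are made up:

import json

# hypothetical marker dict in the shape sandbag returns
marker = {"G1": [("GeneA", "GeneB")], "S": [("GeneC", "GeneD")]}

with open("marker.json", "w") as f:
    json.dump(marker, f)      # tuples are written as JSON arrays

with open("marker.json") as f:
    loaded = json.load(f)     # pairs come back as lists, not tuples
print(loaded["G1"])           # [['GeneA', 'GeneB']]
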
Example #2
def filter_unexpressed_genes(data, gene_names):
    """Drop genes (columns) that have zero counts across all samples."""
    mask = np.invert(np.all(data == 0, axis=0))
    x = data[:, mask]
    gene_names = np.array(gene_names)[mask]

    if sum(mask) != len(mask):
        logg.hint(
            "filtered out {} unexpressed genes".format(len(mask) - sum(mask)))

    return x, list(gene_names)
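
A quick toy check of the masking logic on a hand-made count matrix (gene names and values are invented):

import numpy as np

data = np.array([[1, 0, 3],
                 [2, 0, 0]])                  # samples x genes; the middle gene is never expressed
gene_names = ["g1", "g2", "g3"]

mask = np.invert(np.all(data == 0, axis=0))   # True where a gene has at least one count
print(data[:, mask])                          # [[1 3]
                                              #  [2 0]]
print(np.array(gene_names)[mask])             # ['g1' 'g3']
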
Example #3
def filter_marker_pairs(marker_pairs, gene_names):
    """Translate marker pairs from gene names to indices into the per-category subset of used genes, dropping pairs
    with genes missing from ``gene_names``. Also returns a boolean used-gene mask per category."""
    gene_name_to_idx = {g: i for i, g in enumerate(gene_names)}
    removed = 0

    marker_pairs_idx = {}

    used_masks = {}
    for cat, pairs in marker_pairs.items():
        used_mask = np.zeros(len(gene_names), dtype='bool')
        for pair in pairs:
            try:
                g1_idx = gene_name_to_idx[pair[0]]
                g2_idx = gene_name_to_idx[pair[1]]

                used_mask[g1_idx] = True
                used_mask[g2_idx] = True
            except KeyError:
                removed += 1
        used_masks[cat] = used_mask
        used_idx = np.where(used_mask)[0].tolist()

        new_idx = {u: i for i, u in enumerate(used_idx)}

        new_pairs_idx = []
        for pair in pairs:
            try:
                g1_idx = gene_name_to_idx[pair[0]]
                g2_idx = gene_name_to_idx[pair[1]]

                new_pairs_idx.append([new_idx[g1_idx], new_idx[g2_idx]])
            except KeyError:
                #logg.hint("genepair ({}, {}) not present in dataset".format(pair[0], pair[1]))
                # producing to much output..
                pass

        marker_pairs_idx[cat] = np.array(new_pairs_idx)

    logg.hint("translated marker pairs, {} removed".format(removed))
    return marker_pairs_idx, used_masks
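
The core of the translation step, reduced to a standalone sketch with invented gene names: a pair referencing a gene that is absent from the dataset raises a KeyError and is dropped.

gene_names = ["g0", "g1", "g2"]               # genes present in the dataset
pairs = [("g0", "g2"), ("g1", "gX")]          # the second pair references a missing gene

name_to_idx = {g: i for i, g in enumerate(gene_names)}
kept, removed = [], 0
for g1, g2 in pairs:
    try:
        kept.append((name_to_idx[g1], name_to_idx[g2]))
    except KeyError:                          # either gene missing -> drop the pair
        removed += 1
print(kept, removed)                          # [(0, 2)] 1
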
Example #4
def get_filter_masks(data, gene_names, sample_names, categories, filter_genes,
                     filter_samples):
    """Build boolean gene and sample masks from the passed filters, dropping unexpressed genes and samples without
    a category."""
    dim_before_filter = data.shape

    unexpressed_genes = np.invert(np.all(data == 0, axis=0))
    filtered_genes = to_boolean_mask(filter_genes, gene_names)
    gene_mask = np.logical_and(unexpressed_genes, filtered_genes)

    filtered_samples = to_boolean_mask(filter_samples, sample_names)
    not_categorized = categories[0]
    for i in range(1, categories.shape[0]):
        not_categorized = np.logical_or(not_categorized, categories[i])
    sample_mask = np.logical_and(not_categorized, filtered_samples)

    filtered = (np.sum(sample_mask) < dim_before_filter[0]
                or np.sum(gene_mask) < dim_before_filter[1])
    if filtered:
        logg.hint(
            "filtered out {} samples and {} genes based on passed subsets".
            format(dim_before_filter[0] - np.sum(sample_mask),
                   dim_before_filter[1] - np.sum(gene_mask)))
        logg.hint("filtered data will be of shape {} x {}".format(
            np.sum(sample_mask), np.sum(gene_mask)))

    return gene_mask, sample_mask
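
A toy illustration of how the two masks are combined; np.logical_or.reduce collapses the explicit loop over categories used above, and all arrays below are invented:

import numpy as np

# keep genes that are both expressed and selected by the gene filter
expressed = np.array([True, True, False, True])
selected = np.array([True, False, True, True])
gene_mask = np.logical_and(expressed, selected)
print(gene_mask)                    # [ True False False  True]

# keep samples that belong to at least one category
categories = np.array([[True, False, False],
                       [False, True, False]])
in_any_category = np.logical_or.reduce(categories, axis=0)
print(in_any_category)              # [ True  True False]
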
Example #5
def parse_data(
    data: Union[AnnData, DataFrame, np.ndarray],
    gene_names: Optional[Iterable[str]] = None,
    sample_names: Optional[Iterable[str]] = None
) -> Tuple[np.ndarray, list, list]:
    """Reduces :class:`~anndata.AnnData` and :class:`~pandas.DataFrame` to a :class:`~numpy.dnarray` and extracts
    `gene_names` and `sample_names` from index and column names."""
    if isinstance(data, AnnData):
        if sample_names is None:
            sample_names = list(data.obs_names)

        if gene_names is None:
            gene_names = list(data.var_names)

        raw_data = data.X
    else:
        if isinstance(data, DataFrame):
            if gene_names is None:
                gene_names = list(data.columns)
            if sample_names is None:
                sample_names = list(data.index)

            raw_data = data.values
        elif isinstance(data, np.ndarray):
            if gene_names is None or sample_names is None:
                raise ValueError(
                    "Provide gene names and sample names in ``gene_names`` and ``sample_names``"
                )

            raw_data = data
        else:
            raise ValueError(
                "data can only be of type AnnData, DataFrame or ndarray")

    logg.hint("passed data of shape {} x {} (samples x genes)".format(
        *raw_data.shape))
    return raw_data, gene_names, sample_names
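
A standalone check of the DataFrame branch on toy data: the index supplies the sample names and the columns supply the gene names.

import numpy as np
from pandas import DataFrame

df = DataFrame(np.array([[1.0, 0.0], [2.0, 3.0]]),
               index=["cell_1", "cell_2"],      # sample names
               columns=["gene_a", "gene_b"])    # gene names

raw_data = df.values
sample_names = list(df.index)
gene_names = list(df.columns)
print(raw_data.shape, sample_names, gene_names)
# (2, 2) ['cell_1', 'cell_2'] ['gene_a', 'gene_b']
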
Example #6
def load_marker(fname: str, defaultpath: Optional[bool] = True):
    """Export marker pairs to json-File.

    Parameters
    ----------
    fname
        Name of the json-File to write to
    defaultpath
        Use settings.writedir as root. Default: True
    """

    if defaultpath:
        fpath = settings.writedir + fname
    else:
        fpath = fname

    try:
        marker = read_dict_from_json(fpath)
    except IOError:
        logg.error(
            "could not read from {}.\n Please verify that the path exists and is readable."
            .format(fpath))
        return None

    if settings.verbosity > 2:
        count_total = 0
        count_str = []
        for m, p in marker.items():
            c = len(p)
            count_total += c
            count_str.append("\t{}: {}".format(m, c))

        logg.hint("loaded {} marker pairs".format(count_total))
        for s in count_str:
            logg.hint(s)

    return marker
Example #7
def parallel_njit(func: Callable[[], Any],
                  jitted: Optional[bool] = True) -> Callable[[], Any]:
    """Dynamic decorator for jit-compiled functions.
    Adds parallel=True if settings.n_jobs > 1
    """
    if jitted is False or settings.enable_jit is False:
        logg.warn(
            'starting uncompiled processing. Should only be used for debugging and testing!'
        )
        return func

    if settings.n_jobs > 1:
        if is_win32() is False:
            logg.hint('starting parallel processing with {} threads'.format(
                settings.n_jobs))
            return njit(func, parallel=True, fastmath=settings.enable_fastmath)
        else:
            logg.error(
                'n_jobs is set to {} but multiprocessing is not supported for your platform! '
                'falling back to single core... '.format(settings.n_jobs))
            return njit(func, fastmath=settings.enable_fastmath)
    else:
        logg.hint('starting processing with 1 thread')
        return njit(func, fastmath=settings.enable_fastmath)
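
A minimal sketch of the same wrapping call style on a throwaway function, assuming numba is installed; row_sums is made up for illustration and is not part of pypairs:

import numpy as np
from numba import njit, prange

def row_sums(x):
    out = np.zeros(x.shape[0])
    for i in prange(x.shape[0]):   # prange only runs in parallel when compiled with parallel=True
        out[i] = x[i].sum()
    return out

# wrap an existing function instead of decorating it, as parallel_njit does
fast = njit(row_sums, parallel=True, fastmath=True)
print(fast(np.ones((4, 3))))       # [3. 3. 3. 3.]
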
Example #8
def parse_data_and_annotation(
    data: Union[AnnData, DataFrame, np.ndarray, Iterable[Iterable[float]]],
    annotation: Optional[Mapping[str, Iterable[Union[str, int, bool]]]] = None,
    gene_names: Optional[Iterable[str]] = None,
    sample_names: Optional[Iterable[str]] = None
) -> Tuple[np.ndarray, list, list, np.ndarray, np.ndarray]:
    """Like :func:`parse_data`, but additionally resolves the per-sample category masks, either from ``annotation``
    or from ``data.obs['category']`` of an :class:`~anndata.AnnData` object."""
    raw_data, gene_names, sample_names = parse_data(data, gene_names,
                                                    sample_names)

    if isinstance(data, AnnData):
        if annotation:
            category_names, categories = parse_annotation(
                annotation, sample_names)
        else:
            if 'category' in data.obs_keys():
                category_names = np.unique(data.obs['category'])

                categories = np.ndarray(shape=(len(category_names),
                                               len(sample_names)),
                                        dtype=bool)

                logg.hint("passed {} categories: {}".format(
                    len(category_names), str(category_names)))
                for i, name in enumerate(category_names):
                    categories[i] = np.isin(data.obs['category'], name)
                    if type(categories[i][0]) == bool or type(
                            categories[i][0]) == np.bool_:
                        logg.hint("\t{}: {}".format(name, sum(categories[i])))
                    else:
                        logg.hint("\t{}: {}".format(name, len(categories[i])))

            else:
                raise ValueError(
                    "Provide categories as data.obs['category'] or in ``annotation``"
                )
    else:
        if annotation:
            category_names, categories = parse_annotation(
                annotation, sample_names)
        else:
            raise ValueError("Provide categories in ``annotation``")

    return raw_data, gene_names, sample_names, category_names, categories
Example #9
def parse_annotation(
        annotation: Mapping[str, Iterable[Union[str, int, bool]]],
        sample_names: Iterable[str]) -> Tuple[np.ndarray, np.ndarray]:
    """ Translates a dictionary annotation {'category': [sample1, sample2, ...], ..} into a list of boolean masks.
    Accepts index, names and boolean."""
    category_names = np.array(list(annotation.keys()))
    categories = np.ndarray(shape=(len(category_names), len(sample_names)),
                            dtype=bool)

    logg.hint("passed {} categories: {}".format(len(category_names),
                                                str(category_names)))
    for i, k in enumerate(annotation.keys()):
        if type(annotation[k][0]) == bool or type(
                annotation[k][0]) == np.bool_:
            logg.hint("\t{}: {}".format(k, sum(annotation[k])))
        else:
            logg.hint("\t{}: {}".format(k, len(annotation[k])))

        categories[i] = to_boolean_mask(np.array(annotation[k]), sample_names)

    return category_names, categories
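
For annotations given as sample names, a boolean mask like the one to_boolean_mask returns can be built with np.isin; a toy sketch with invented names:

import numpy as np

sample_names = np.array(["c1", "c2", "c3", "c4"])
annotation = {"G1": ["c1", "c3"], "S": ["c2"]}     # category -> sample names

categories = np.zeros((len(annotation), len(sample_names)), dtype=bool)
for i, samples in enumerate(annotation.values()):
    categories[i] = np.isin(sample_names, samples)
print(categories)
# [[ True False  True False]
#  [False  True False False]]
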
Example #10
def sandbag(
    data: Union[AnnData, DataFrame, np.ndarray, Collection[Collection[float]]],
    annotation: Optional[Mapping[str, Collection[Union[str, int,
                                                       bool]]]] = None,
    gene_names: Optional[Collection[str]] = None,
    sample_names: Optional[Collection[str]] = None,
    fraction: float = 0.65,
    filter_genes: Optional[Collection[Union[str, int, bool]]] = None,
    filter_samples: Optional[Collection[Union[str, int, bool]]] = None
) -> Mapping[str, Collection[Tuple[str, str]]]:
    """
    Calculates 'marker pairs' from a gene count matrix (cells x genes).

    A pair of genes `(g1, g2)` is considered a marker for a category if `g1 > g2` holds in at least a ``fraction`` of
    that category's cells and `g1 < g2` holds in at least a ``fraction`` of the cells in every other category.

    ``data`` can be of type :class:`~anndata.AnnData`, :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should
    contain the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows correspond to cells and columns to
    genes.

        *
            If data is an :class:`~anndata.AnnData` object, the category for each sample should be in
            ``data.obs['category']``, gene names in ``data.var_names`` and sample names in ``data.obs_names``.

        *
            If data is a :class:`~pandas.DataFrame` object, gene names can be in ``df.columns`` or passed via
            ``gene_names`` and sample names in ``df.index`` or passed via ``sample_names``. The category for each
            sample must be passed via ``annotation``.

            *
                ``annotation`` must be in form of `{'category1': ['sample_1','sample_2',...], ...}`. List of samples
                for indexing can be integer, str or a boolean mask of ``len(sample_names)``.

        *
            If data is a :class:`~numpy.ndarray`, all information must be passed via ``annotation``, ``gene_names`` and
            ``sample_names`` parameters.

    Marker pairs are returned as a mapping from category to a list of gene 2-tuples:
    `{'category': [(Gene_1, Gene_2), ...], ...}`

    Parameters
    ----------

    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    annotation
        Mapping from category to samples. If ``data`` is not :class:`~anndata.AnnData`, this is required.
        The list of samples can be indices, names or a logical mask.
    gene_names
        Names for genes, must be same length as ``n_vars``. If ``data`` is not :class:`~anndata.AnnData`, this is
        required.
    sample_names
        Names for samples, must be same length as ``n_obs``. If ``data`` is not :class:`~anndata.AnnData`, this is
        required.
    fraction
        Fraction of cells per category where marker criteria must be satisfied. Default: 0.65
    filter_genes
        A list of genes to keep. If not ``None``, all genes not in this list will be removed.
        The list can be indices, names or a logical mask.
    filter_samples
        A list of samples to keep. If not ``None``, all samples not in this list will be removed.
        The list can be indices, names or a logical mask.

    Returns
    -------

    marker_pairs_dict
        A dict mapping from str to a list of 2-tuples, where the key is the category and the list contains the marker
        pairs: `{'Category_1': [(Gene_1, Gene_2), ...], ...}`.

    Examples
    --------
        To generate marker pairs for a different fraction
        than the default (0.65) based on the bundled ``oscope``-dataset [Leng15]_ run::

            from pypairs import pairs, datasets

            adata = datasets.leng15()
            marker_pairs = pairs.sandbag(adata, fraction=0.5)

    """
    logg.info('identifying marker pairs with sandbag', r=True)
    logg.hint('sandbag running with fraction of {}'.format(fraction))

    # AnnData or DataFrame or ndarray -> ndarray + meta information
    data, gene_names, sample_names, category_names, categories = utils.parse_data_and_annotation(
        data, annotation, gene_names, sample_names)

    # Get filter mask based on filter selection, and filter out unexpressed genes
    gene_mask, sample_mask = utils.get_filter_masks(data, gene_names,
                                                    sample_names, categories,
                                                    filter_genes,
                                                    filter_samples)

    # Apply mask to gene names and categories, samples are not needed
    gene_names = np.array(gene_names)[gene_mask]
    categories = categories[:, sample_mask]

    # Remove empty categories
    categories, category_names = remove_empty_categories(
        categories, category_names)

    # Cells in category * fraction
    thresholds = calc_thresholds(categories, fraction)

    # Turn the boolean category masks into an array of per-sample category indices
    cats = np.where(categories.T)[1]

    data = data.astype(float)

    # Decorate check_pairs according to settings and platform
    check_pairs_decorated = utils.parallel_njit(check_pairs)
    pairs = check_pairs_decorated(data[sample_mask][:, gene_mask], cats,
                                  thresholds)

    # Convert to easier to read dict and return
    marker_pos = np.where(pairs != -1)

    marker_pairs_dict = defaultdict(list)

    for i in range(0, len(marker_pos[0])):
        g1 = marker_pos[0][i]
        g2 = marker_pos[1][i]
        cat = pairs[g1, g2]

        marker_pairs_dict[category_names[cat]].append(
            (gene_names[g1], gene_names[g2]))

    logg.info('finished', time=True)

    # Print count of marker pairs per category
    if settings.verbosity > 2:
        count_total = 0
        count_str = []
        for m, p in marker_pairs_dict.items():
            c = len(p)
            count_total += c
            count_str.append("\t{}: {}".format(m, c))

        logg.hint("found {} marker pairs".format(count_total))
        for s in count_str:
            logg.hint(s)

    return dict(marker_pairs_dict)
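
A toy check of the marker-pair criterion described in the docstring, written directly in numpy rather than via the jit-compiled check_pairs; the counts and the two categories are invented:

import numpy as np

# toy counts for two genes (columns g1, g2): 3 cells in category A, 3 in category B
a = np.array([[5, 1], [4, 2], [6, 0]])   # g1 > g2 in every A cell
b = np.array([[1, 5], [0, 4], [2, 6]])   # g1 < g2 in every B cell
fraction = 0.65

up_in_a = np.mean(a[:, 0] > a[:, 1]) >= fraction
down_in_b = np.mean(b[:, 0] < b[:, 1]) >= fraction
print(up_in_a and down_in_b)             # True -> (g1, g2) qualifies as a marker pair for A
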
Example #11
def cyclone(data: Union[AnnData, DataFrame, np.ndarray,
                        Collection[Collection[float]]],
            marker_pairs: Optional[Mapping[str,
                                           Collection[Tuple[str,
                                                            str]]]] = None,
            gene_names: Optional[Collection[str]] = None,
            sample_names: Optional[Collection[str]] = None,
            iterations: Optional[int] = 1000,
            min_iter: Optional[int] = 100,
            min_pairs: Optional[int] = 50) -> DataFrame:
    """Score samples for each category based on marker pairs.

    ``data`` can be of type :class:`~anndata.AnnData`, :class:`~pandas.DataFrame` or :class:`~numpy.ndarray` and should
    contain the raw or normalized gene counts of shape ``n_obs`` * ``n_vars``. Rows correspond to cells and columns to
    genes.

        *
            If a :class:`~anndata.AnnData` object is passed, the category scores and the final prediction will be added
            to ``data.obs`` with key ``pypairs_{category}_score`` and ``pypairs_max_class``.

            *
                If the marker pairs contain only the cell cycle categories G1, S and G2M, an additional column
                ``pypairs_cc_prediction`` will be added. Category S is assigned to samples whose G1 and G2M scores
                are both below 0.5, as described in [Scialdone15]_.


    ``marker_pairs``, i.e. the output from :func:`~pypairs.pairs.sandbag`, must be a mapping from category to a list of
    gene 2-tuples: `{'category': [(Gene_1, Gene_2), ...], ...}`.

        *
            If no ``marker_pairs`` are passed, the defaults from :func:`~pypairs.datasets.default_cc_marker`
            are used, based on [Leng15]_ (marker pairs for cell cycle prediction).

    Parameters
    ----------

    data
        The (annotated) data matrix of shape ``n_obs`` * ``n_vars``.
        Rows correspond to cells and columns to genes.
    marker_pairs
        A dict mapping from str to a list of 2-tuples, where the key is the category and the list contains the marker
        pairs: {'Category_1': [(Gene_1, Gene_2), ...], ...}. If not provided, default marker pairs are used.
    gene_names
        Names for genes, must be same length as ``n_vars``.
    sample_names
        Names for samples, must be same length as ``n_obs``.
    iterations
        An integer specifying the number of iterations for random sampling to obtain a cycle score. Default: 1000
    min_iter
        An integer specifying the minimum number of iterations for score estimation. Default: 100
    min_pairs
        An integer specifying the minimum number of pairs for cycle estimation. Default: 50

    Returns
    -------

    A :class:`~pandas.DataFrame` with samples as index and categories as columns, containing the score of each category
    for each sample, plus an additional column with the name of the max-scoring category for each sample.

        *
            If the marker pairs contain only the cell cycle categories G1, S and G2M, an additional column
            ``cc_prediction`` will be added. Category S is assigned to samples whose G1 and G2M scores are both
            below 0.5, as described in [Scialdone15]_.


    Examples
    --------
        To predict the cell cycle phase of the unsorted cells from the [Leng15]_ dataset run::

            from pypairs import pairs, datasets

            adata = datasets.leng15('unsorted')
            marker_pairs = datasets.default_cc_marker()
            scores = pairs.cyclone(adata, marker_pairs)
            print(scores)

    """
    logg.info('predicting category scores with cyclone', r=True)

    if marker_pairs is None:
        logg.hint(
            'no marker pairs passed, using default cell cycle prediction marker'
        )
        marker_pairs = datasets.default_cc_marker()

    raw_data, gene_names, sample_names = utils.parse_data(
        data, gene_names, sample_names)

    # Filter marker pairs to those where both genes are present in `data`
    marker_pairs, used = filter_marker_pairs(marker_pairs, gene_names)

    logg.hint('starting processing with {} thread(s)'.format(settings.n_jobs))

    raw_data = raw_data.astype(float)

    get_phase_scores_decorated = utils.parallel_njit(get_phase_scores)

    scores = {
        cat: get_phase_scores_decorated(raw_data, iterations, min_iter,
                                        min_pairs, pairs, used[cat])
        for cat, pairs in marker_pairs.items()
    }

    scores_df = DataFrame(scores, columns=marker_pairs.keys())
    scores_df.index = sample_names
    scores_df['max_class'] = scores_df.idxmax(axis=1)

    if len(marker_pairs.items()) == 3 and all(elem in marker_pairs.keys()
                                              for elem in ["G1", "S", "G2M"]):
        scores_cc = scores_df.loc[:, ["G1", "G2M"]].idxmax(axis=1)
        scores_df['cc_prediction'] = [
            "S" if x < 0.5 else scores_cc[i]
            for i, x in enumerate(scores_df.loc[:, ["G1", "G2M"]].max(
                axis=1).values)
        ]

    if isinstance(data, AnnData):
        logg.hint('adding scores with key "pypairs_{category}" to `data.obs`')
        logg.hint(
            'adding max_class with key "pypairs_max_class" to `data.obs`')
        if len(marker_pairs.items()) == 3 and all(
                elem in marker_pairs.keys() for elem in ["G1", "S", "G2M"]):
            logg.hint(
                'adding cc_prediction with key "pypairs_cc_prediction" to `data.obs`'
            )

        for name, values in scores_df.items():  # .iteritems() was removed in pandas 2.0
            key_name = 'pypairs_{}'.format(name)
            data.obs[key_name] = values

    logg.info('finished', time=True)
    return scores_df
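
A toy reconstruction of the S-phase fallback rule applied above, on a hand-made score table (all values are invented):

import numpy as np
from pandas import DataFrame

scores_df = DataFrame({"G1": [0.9, 0.2, 0.1],
                       "S": [0.3, 0.8, 0.4],
                       "G2M": [0.1, 0.3, 0.9]},
                      index=["c1", "c2", "c3"])
scores_df["max_class"] = scores_df.idxmax(axis=1)

# S is assigned wherever both the G1 and the G2M score stay below 0.5
g1_g2m = scores_df.loc[:, ["G1", "G2M"]]
scores_df["cc_prediction"] = np.where(g1_g2m.max(axis=1) < 0.5,
                                      "S", g1_g2m.idxmax(axis=1))
print(scores_df["cc_prediction"].tolist())     # ['G1', 'S', 'G2M']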