Exemple #1
0
def sandbag(adata,
            phases,
            fraction=0.5,
            subset_genes=None,
            subset_samples=None,
            processes=1):
    """Generate pairs of genes [Scialdone15]_.

    Calculates the pairs of genes serving as marker pairs for each phase,
    based on a matrix of gene counts and an annotation of known phases.

    This reproduces the approach in Pairs [Scialdone15]_ and has been
    implemented for Scanpy by Ron Fechtner. The actual computation is
    delegated to ``pypairs.sandbag``; this function only converts ``adata.X``
    to a :class:`pandas.DataFrame` and forwards the arguments.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        The annotated data matrix. Only ``adata.X`` is read.
    phases : `dict`
        Dictionary of lists, i.e. {phase: [sample, ...]},
        containing annotation of samples to their phase
    fraction : `float`, optional (default: 0.5)
        Fraction to be used as threshold. Forwarded to ``pypairs.sandbag``.
    subset_genes : `list` or `None`, optional (default: `None`)
        Genes for sampling the reference set. Default is all genes.
    subset_samples : `list` or `None`, optional (default: `None`)
        Cells for sampling the reference set. Default is all samples.
    processes : `int`, optional (default: 1)
        Number of concurrent processes to be used. 0 = use all available cores.

    Returns
    -------
    `dict` of `list` of `tuple`, i.e.
    {phase: [(Gene1, Gene2), ...]},
    containing marker pairs per phase

    Raises
    ------
    ImportError
        If the optional dependency `pypairs` is not installed.

    Examples
    --------
    See this `notebook <https://github.com/theislab/scanpy_usage/tree/master/180209_cell_cycle>`_.
    """
    # pypairs is an optional dependency, so it is imported lazily and a
    # missing install is reported with an actionable message.
    try:
        import pypairs
    except ImportError:
        raise ImportError('You need to install the package `pypairs`.')

    # NOTE(review): wrapping only ``adata.X`` gives the frame default integer
    # labels, so name-based ``subset_genes`` / ``subset_samples`` may not
    # match anything -- confirm what labels pypairs expects.
    x = pd.DataFrame(adata.X)

    # ``fraction`` used to be accepted but silently dropped; pass it through
    # so the caller's threshold actually takes effect.
    return pypairs.sandbag(x=x,
                           phases=phases,
                           fraction=fraction,
                           subset_genes=subset_genes,
                           subset_samples=subset_samples,
                           processes=processes)
Exemple #2
0
def sandbag(adata,
            phases,
            fraction=0.5,
            subset_genes=None,
            subset_samples=None,
            n_jobs=1):
    """Generate pairs of genes [Scialdone15]_ [Fechtner18]_.

    Calculates the pairs of genes serving as marker pairs for each phase,
    based on a matrix of gene counts and an annotation of known phases.

    This reproduces the approach of [Scialdone15]_ in the implementation of
    [Fechtner18]_. The actual computation is delegated to
    ``pypairs.sandbag``; this function only converts ``adata.X`` to a
    :class:`pandas.DataFrame` and forwards the arguments.

    More information and bug reports `here
    <https://github.com/rfechtner/pypairs>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        The annotated data matrix. Only ``adata.X`` is read.
    phases : `dict`
        Dictionary of lists, i.e. {phase: [sample, ...]},
        containing annotation of samples to their phase
    fraction : `float`, optional (default: 0.5)
        Fraction to be used as threshold. Forwarded to ``pypairs.sandbag``.
    subset_genes : `list` or `None`, optional (default: `None`)
        Genes for sampling the reference set. Default is all genes.
    subset_samples : `list` or `None`, optional (default: `None`)
        Cells for sampling the reference set. Default is all samples.
    n_jobs : `int`, optional (default: 1)
        Number of concurrent jobs to be used. 0 = use all available cores.
        Passed to ``pypairs.sandbag`` as its ``processes`` argument.

    Returns
    -------
    `dict` of `list` of `tuple`, i.e.
    {phase: [(Gene1, Gene2), ...]},
    containing marker pairs per phase

    Raises
    ------
    ImportError
        If the optional dependency `pypairs` is not installed.
    """
    # pypairs is an optional dependency, so it is imported lazily and a
    # missing install is reported with an actionable message.
    try:
        import pypairs
    except ImportError:
        raise ImportError('You need to install the package `pypairs`.')

    # NOTE(review): wrapping only ``adata.X`` gives the frame default integer
    # labels, so name-based ``subset_genes`` / ``subset_samples`` may not
    # match anything -- confirm what labels pypairs expects.
    x = pd.DataFrame(adata.X)

    # ``fraction`` used to be accepted but silently dropped; pass it through
    # so the caller's threshold actually takes effect.
    return pypairs.sandbag(x=x,
                           phases=phases,
                           fraction=fraction,
                           subset_genes=subset_genes,
                           subset_samples=subset_samples,
                           processes=n_jobs)
Exemple #3
0
def load_ocope_marker(relPath,
                      fraction=0.6,
                      cc_only=True,
                      weighted=False,
                      triplets=False,
                      processes=10):
    """Compute marker pairs from the Oscope (GSE64016) count matrix.

    Reads ``GSE64016_H1andFUCCI_normalized_EC_human.csv`` from ``relPath``,
    keeps the columns whose name contains ``"G1_"``, ``"G2_"`` or ``"S_"``,
    builds a phase annotation from those column names and hands everything to
    ``pairs.sandbag``.

    Parameters
    ----------
    relPath : `str`
        Directory prefix of the input files. File names are appended by plain
        string concatenation, so it must end with a path separator.
    fraction : `float`, optional (default: 0.6)
        Threshold forwarded to ``pairs.sandbag``.
    cc_only : `bool`, optional (default: `True`)
        If true, restrict the gene set to the union of
        ``go_0007049_homoSapiens.csv`` and the first column of
        ``cyclebase_top1000_genes.tsv`` (header line skipped). These two files
        are only read in this case.
    weighted : `bool`, optional (default: `False`)
        Forwarded to ``pairs.sandbag``.
    triplets : `bool`, optional (default: `False`)
        Forwarded to ``pairs.sandbag``.
    processes : `int`, optional (default: 10)
        Number of processes forwarded to ``pairs.sandbag``.

    Returns
    -------
    Whatever ``pairs.sandbag`` returns for the selected data.
    """
    # Load matrix; the first (unnamed) CSV column becomes the row index.
    gencounts_oscope = pandas.read_csv(
        Path(relPath + "GSE64016_H1andFUCCI_normalized_EC_human.csv"))
    gencounts_oscope.set_index("Unnamed: 0", inplace=True)

    # Keep only the columns that carry a phase tag in their name.
    phase_tags = ("G1_", "G2_", "S_")
    tagged_positions = [
        pos for pos, name in enumerate(gencounts_oscope.columns)
        if any(tag in name for tag in phase_tags)
    ]
    gencounts_oscope_sorted = gencounts_oscope.iloc[:, tagged_positions]

    # The annotation refers to column positions within the *subset* frame.
    subset_columns = gencounts_oscope_sorted.columns
    annotation = {
        "G1": [pos for pos, name in enumerate(subset_columns)
               if "G1_" in name],
        "S": [pos for pos, name in enumerate(subset_columns)
              if "S_" in name],
        "G2M": [pos for pos, name in enumerate(subset_columns)
                if "G2_" in name],
    }

    # Arguments shared by both the restricted and the unrestricted run.
    sandbag_kwargs = dict(phases=annotation,
                          fraction=fraction,
                          processes=processes,
                          verbose=True,
                          weighted=weighted,
                          triplets=triplets)

    if cc_only:
        # Context managers make sure both gene-list files are closed again.
        with open(relPath + "go_0007049_homoSapiens.csv", "r") as go_file:
            go_0007049 = [
                line.replace("\n", "").replace("\r", "") for line in go_file
            ]
        with open(relPath + "cyclebase_top1000_genes.tsv", "r") as cb_file:
            # Skip the header line and take the first tab-separated field.
            cycle_base = [
                line.split("\t")[0] for i, line in enumerate(cb_file) if 0 < i
            ]
        cycle_genes = numpy.unique(
            numpy.concatenate((go_0007049, cycle_base), 0))
        sandbag_kwargs["subset_genes"] = list(cycle_genes)

    return pairs.sandbag(gencounts_oscope_sorted, **sandbag_kwargs)